From c0bf108c177b29a4f100f3625de7a3f40464148c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 11 Jul 2024 14:33:42 -0700 Subject: [PATCH 01/42] test_utils refactor, local_cpu_allocator --- .../include/kernels/local_cpu_allocator.h | 22 ++ lib/kernels/src/local_cpu_allocator.cc | 35 ++ lib/kernels/test/src/test_attention_kernel.cc | 26 +- .../test/src/test_batch_matmul_kernel.cc | 10 +- .../test/src/test_batch_norm_kernel.cc | 54 ++-- lib/kernels/test/src/test_cast_kernel.cc | 84 +++-- lib/kernels/test/src/test_combine_kernel.cc | 81 ++++- lib/kernels/test/src/test_concat_kernel.cc | 33 +- lib/kernels/test/src/test_dropout.cc | 19 +- lib/kernels/test/src/test_flat_kernel.cc | 40 +-- lib/kernels/test/src/test_gather_kernels.cc | 29 +- .../test/src/test_layer_norm_kernels.cc | 25 +- lib/kernels/test/src/test_partition_kernel.cc | 41 +-- lib/kernels/test/src/test_pool_2d_kernels.cc | 32 +- lib/kernels/test/src/test_reduction_kernel.cc | 37 +-- lib/kernels/test/src/test_replicate_kernel.cc | 102 ++++-- lib/kernels/test/src/test_reshape_kernel.cc | 35 +- lib/kernels/test/src/test_reverse_kernels.cc | 103 +++++- lib/kernels/test/src/test_softmax_kernel.cc | 25 +- lib/kernels/test/src/test_split_kernel.cc | 19 +- lib/kernels/test/src/test_transpose_kernel.cc | 29 +- lib/kernels/test/src/test_utils.cc | 304 +++++++++++++----- lib/kernels/test/src/test_utils.h | 88 ++++- 23 files changed, 842 insertions(+), 431 deletions(-) create mode 100644 lib/kernels/include/kernels/local_cpu_allocator.h create mode 100644 lib/kernels/src/local_cpu_allocator.cc diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h new file mode 100644 index 0000000000..27dcc9d854 --- /dev/null +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -0,0 +1,22 @@ +#include "kernels/allocation.h" +#include <unordered_set> + +namespace FlexFlow { + +struct LocalCPUAllocator : public IAllocator { + LocalCPUAllocator() = default; + LocalCPUAllocator(LocalCPUAllocator const &) = delete; + LocalCPUAllocator(LocalCPUAllocator &&) = delete; + ~LocalCPUAllocator() override; + + void *allocate(size_t) override; + void deallocate(void *) override; + +private: + std::unordered_set<void *> ptrs; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); + +Allocator create_local_cpu_memory_allocator(); + +} // namespace FlexFlow diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc new file mode 100644 index 0000000000..6553dc2f88 --- /dev/null +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -0,0 +1,35 @@ +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" + +namespace FlexFlow { +void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + if (ptr != nullptr) { + this->ptrs.insert(ptr); + } else { + throw std::bad_alloc(); + } + return ptr; +} + +void LocalCPUAllocator::deallocate(void *ptr) { + if (contains(this->ptrs, ptr)) { + free(ptr); + this->ptrs.erase(ptr); + } else { + throw std::runtime_error( + "Deallocating a pointer that was not allocated by this Allocator"); + } +} + +LocalCPUAllocator::~LocalCPUAllocator() { + for (auto ptr : ptrs) { + free(ptr); + } +} + +Allocator create_local_cpu_memory_allocator() { + return Allocator::create<LocalCPUAllocator>(); +} + +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index d44129ece1..023233ecb0 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ 
b/lib/kernels/test/src/test_attention_kernel.cc @@ -13,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t qoSeqLength = 20, kvSeqLength = 20; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -33,16 +35,16 @@ kvSeqLength, false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); + TensorShape query_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, qSize}, DataType::FLOAT); + TensorShape key_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, kSize}, DataType::FLOAT); + TensorShape value_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, vSize}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); TensorShape weight_shape = - make_float_tensor_shape_from_legion_dims({state.weightSize}); + make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -66,9 +68,7 @@ weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector<float> host_output = load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 18e6977148..8a11a069f5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,16 +15,18 @@ TEST_SUITE(FF_TEST_SUITE) { size_t seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); + make_tensor_shape_from_legion_dims({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); + make_tensor_shape_from_legion_dims({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 8487bbda6a..03a3a1ad40 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/batch_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using 
namespace ::FlexFlow; @@ -9,7 +10,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -23,25 +26,25 @@ output_w, true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape scale_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape bias_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -50,10 +53,7 @@ scale_accessor.get_float_ptr(), bias_accessor.get_float_ptr()); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -68,28 +68,18 @@ Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr(), output_accessor.get_float_ptr(), + output_grad_accessor.get_float_ptr(), + input_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), scale_accessor.get_float_ptr(), scale_grad_accessor.get_float_ptr(), bias_grad_accessor.get_float_ptr(), input_accessor.shape.num_elements()); - std::vector<float> host_input_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector<float> host_scale_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector<float> host_bias_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + 
CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b110208bce..1be5839a9c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_cpu.h" #include "test_utils.h" -#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -11,46 +11,68 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100, 100}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + make_tensor_shape_from_legion_dims({100, 100}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector<double> host_double_data = - load_data_to_host_from_device<double>( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector<float> host_grad_float_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, 2}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10, 2}, DataType::DOUBLE); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + 
managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2e1000cb95..a4688a1030 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,37 @@ #include "doctest/doctest.h" #include "kernels/combine_kernels.h" +#include "kernels/combine_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +39,64 @@ output_grad_accessor, input_grad_accessor); - std::vector<float> host_input_grad = load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5, 5}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR 
input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 2212e384fa..4607171a54 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,21 +1,24 @@ #include "doctest/doctest.h" #include "kernels/concat_kernels.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - size_t num_inputs = 3; - size_t size_per_input = 100; - ff_dim_t concat_axis = ff_dim_t{nonnegative_int{0}}; + size_t num_inputs = 2; + size_t size_per_input = 10; + ff_dim_t concat_axis = ff_dim_t{1}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); + make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {num_inputs, size_per_input}, DataType::FLOAT); Allocator allocator = create_local_cuda_memory_allocator(); @@ -33,21 +36,15 @@ input_accessors, concat_axis); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector<GenericTensorAccessorW> input_grad_accessors = - repeat(num_inputs, [&]() { - return allocator.allocate_tensor(input_shape); - }); + create_random_filled_accessor_r(output_shape, allocator); + std::vector<GenericTensorAccessorW> input_grad_accessors = repeat( + num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), output_grad_accessor, input_grad_accessors, diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index e29143e251..4be2bdf7bb 
100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,6 +1,7 @@ #include "doctest/doctest.h" #include "kernels/dropout_kernels.h" #include "test_utils.h" +#include "utils/containers/count.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -13,11 +14,13 @@ }; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -25,14 +28,12 @@ managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); auto get_zero_count = [](std::vector<float> const &data) { - return std::count_if( - data.begin(), data.end(), [](float x) { return x == 0.0f; }); + return count(data, [](float x) { return x == 0.0f; }); }; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +42,7 @@ input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector<float> host_output_accessor = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 70894858e3..0bb69aa1dc 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/flat_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,15 +8,18 @@ TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +29,21 @@ input_accessor, output_accessor.get_float_ptr()); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - 
create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector<float> backward_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 88ac2f6889..7f97563217 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,24 +5,26 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,16 +34,12 @@ index_accessor, output_accessor); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -51,10 +49,7 @@ index_accessor, input_grad_accessor); - std::vector<float> host_input_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git 
a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 03b2f56bb9..7d7298f83d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/layer_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -11,13 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +34,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 437b37e954..e88c811803 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ #include "doctest/doctest.h" #include "kernels/partition_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -15,47 +18,33 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = 
input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector<float> host_grad_input_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector<float> expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ebb92d39db..00fa968235 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/pool_2d_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -12,7 +13,9 @@ PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -36,10 +39,10 @@ stride_w, pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_w, input_h, input_c, input_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -52,28 +55,23 @@ input_accessor.ptr, output_accessor.ptr); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + 
CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector<float> host_input_grad = load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 1ea740f336..1c389cb20d 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/reduction_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,20 +8,22 @@ TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10, 10, 10, 10}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {10, 10, 10, 10, 10}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({10}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -29,30 +32,22 @@ output_accessor, num_replicas); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector<float> expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 1.0f); - std::vector<float> host_grad_data = load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + 
CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 86d790f03c..27223cc7b5 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,113 @@ #include "doctest/doctest.h" #include "kernels/replicate_kernels.h" +#include "kernels/replicate_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + TEST_CASE("Call Replicate Forward and Backward Kernels") { std::size_t num_replicas = 10; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = input_shape; + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, output_grad_accessor, + input_grad_accessor, + num_replicas); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + std::size_t num_replicas = 2; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + 
+ // Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, num_replicas); - std::vector<float> check_aggregated_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index f56bfacc2b..5c04012da2 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,12 +5,15 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +21,28 @@ SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); 
Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector<float> host_grad_input_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector<float> expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index cdaf65a305..4adf79847a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,5 +1,7 @@ #include "doctest/doctest.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -9,18 +11,21 @@ std::size_t in_blk_size = 10; std::size_t num_out_blks = 1; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,17 +37,14 @@ in_blk_size, input_accessor.shape.num_elements()); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + allocator.allocate_tensor(input_shape); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), num_out_blks, reverse_dim_size, in_blk_size, input_grad_accessor.shape.num_elements()); - std::vector<float> host_grad_input_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 4; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 2; + + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + 
Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Reverse Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + // Run CPU Reverse Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + // Run CPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index f49c1ebbcc..5519c30b80 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -8,12 +8,15 @@ TEST_CASE("Test Softmax Kernel Operations") { int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( @@ -31,30 +34,22 @@ input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + 
create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements()); - std::vector<float> expected_input_grad_data = - std::vector<float>(input_grad_accessor.shape.num_elements(), 1.0f); - std::vector<float> host_input_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index f2346c9244..34993fa151 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,6 +1,8 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; @@ -11,20 +13,23 @@ coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); - std::vector<float *> output_ptrs(num_outputs); - generate_n(output_ptrs.begin(), num_outputs, [&]() { + std::vector<float *> output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); return output_accessor.get_float_ptr(); }); @@ -47,8 +52,8 @@ output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 2904fa01ae..0bc85cb8e0 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -7,10 +7,11 @@ TEST_CASE("Test Transpose Kernel Operations") { std::size_t num_dims = 2; - std::vector<ff_dim_t> perm = {ff_dim_t{nonnegative_int{0}}, - ff_dim_t{nonnegative_int{1}}}; + std::vector<ff_dim_t> perm = {ff_dim_t{0}, ff_dim_t{1}}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -19,41 +20,33 @@ Kernels::Transpose::init_kernel(num_dims, perm); 
TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector<float> host_grad_input_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index b591642570..bfed1241ba 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,105 +1,249 @@ #include "test_utils.h" +#include "op-attrs/tensor_shape.h" +#include <random> -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector<float> host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered<size_t> const &dims, + DataType DT) { + return TensorShape{ + TensorDims{ + ff_ordered_from_legion_ordered(dims), + }, + DT, + }; +} + +template <DataType DT> +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t<DT>
<DT>;
+    T *data_ptr = src_accessor.get<DT>();
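+    // Fill host-side memory first, then copy into the target allocator's
+    // buffer; the distribution is chosen per element type (bernoulli for
+    // bool, uniform reals on [-1, 1], uniform ints on [0, 100]).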
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    size_t num_elements = get_num_elements(shape);
+    if constexpr (std::is_same<T, bool>::value) {
+      std::bernoulli_distribution dist(0.5);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    } else if constexpr (std::is_floating_point<T>::value) {
+      std::uniform_real_distribution<T> dist(-1.0, 1.0);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    } else if constexpr (std::is_integral<T>::value) {
+      std::uniform_int_distribution<T> dist(0, 100);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    }
+
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    copy_accessor_data_to_l_from_r(dst_accessor, src_accessor);
+
+    return dst_accessor;
+  }
+};
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(float),
-                         cudaMemcpyHostToDevice));
+GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  return DataTypeDispatch1<CreateRandomFilledAccessorW>{}(
+      shape.data_type, shape, allocator);
+}
+
+GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  GenericTensorAccessorW accessor =
+      create_random_filled_accessor_w(shape, allocator);
+
+  return read_only_accessor_from_write_accessor(accessor);
+}
+
+template <DataType DT>
+struct FillWithZeros {
+  void operator()(GenericTensorAccessorW const &accessor) {
+    using T = real_type_t<DT>
;
+
+    if (accessor.device_type == DeviceType::CPU) {
+      memset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T));
+    } else {
+      checkCUDA(
+          cudaMemset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T)));
+    }
+  }
+};
 
-  return accessor;
+void fill_with_zeros(GenericTensorAccessorW const &accessor) {
+  DataTypeDispatch1<FillWithZeros>{}(accessor.data_type, accessor);
 }
 
-GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
-                                                Allocator &allocator,
-                                                float val,
-                                                bool cpu_fill) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<float> host_data(volume, val);
-
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(float),
-                         cudaMemcpyHostToDevice));
+template <DataType DT>
+struct CPUAccessorRContainsNonZero {
+  bool operator()(GenericTensorAccessorR const &accessor) {
+    using T = real_type_t<DT>
;
+
+    T const *data_ptr = accessor.get<DT>();
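+    // Assumes the accessor is already CPU-resident; contains_non_zero below
+    // copies GPU tensors to a CPU allocator before dispatching here.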
+
+    for (size_t i = 0; i < accessor.shape.num_elements(); i++) {
+      if (data_ptr[i] != 0) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
 
-  return accessor;
+bool contains_non_zero(GenericTensorAccessorR const &accessor) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR cpu_accessor =
+      copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
+  return DataTypeDispatch1<CPUAccessorRContainsNonZero>{}(
+      cpu_accessor.data_type, cpu_accessor);
 }
 
-GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape,
-                                                     Allocator &allocator,
-                                                     bool cpu_fill) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<float> host_data(volume);
+GenericTensorAccessorR
+    copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor,
+                                        Allocator &cpu_allocator) {
+  GenericTensorAccessorR cpu_accessor = accessor;
+  if (accessor.device_type == DeviceType::GPU) {
+    cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator);
+  }
+  return cpu_accessor;
+}
 
-  for (size_t i = 0; i < volume; i++) {
-    host_data[i] = i;
+GenericTensorAccessorW
+    copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor,
+                                        Allocator &cpu_allocator) {
+  GenericTensorAccessorW cpu_accessor = accessor;
+  if (accessor.device_type == DeviceType::GPU) {
+    cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator);
   }
+  return cpu_accessor;
+}
+
+template <DataType DT>
+struct Print2DCPUAccessorR {
+  void operator()(GenericTensorAccessorR const &accessor,
+                  std::ostream &stream) {
+    using T = real_type_t<DT>
;
+
+    T const *data_ptr = accessor.get<DT>();
+    int rows = accessor.shape.at(legion_dim_t{0});
+    int cols = accessor.shape.at(legion_dim_t{1});
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(float),
-                         cudaMemcpyHostToDevice));
+    for (int i = 0; i < rows; i++) {
+      for (int j = 0; j < cols; j++) {
+        stream << data_ptr[i * cols + j];
+
+        if (j < cols - 1) {
+          stream << " ";
+        }
+      }
+      stream << std::endl;
+    }
   }
+};
 
-  return accessor;
+void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor,
+                                       std::ostream &stream) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR cpu_accessor =
+      copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
+  // Dispatch on the CPU copy (dispatching on the original accessor would
+  // read a GPU pointer from the host).
+  DataTypeDispatch1<Print2DCPUAccessorR>{}(
+      cpu_accessor.data_type, cpu_accessor, stream);
 }
 
-void fill_tensor_accessor_w(GenericTensorAccessorW accessor,
-                            float val,
-                            bool cpu_fill) {
-  LegionTensorDims dims = accessor.shape.dims;
-  size_t volume = accessor.shape.num_elements();
-  std::vector<float> host_data(volume, val);
-
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(float),
-                         cudaMemcpyHostToDevice));
+template <DataType DT>
+struct AccessorsAreEqual {
+  bool operator()(GenericTensorAccessorR const &accessor_a,
+                  GenericTensorAccessorR const &accessor_b) {
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorR cpu_accessor_a =
+        copy_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator);
+    GenericTensorAccessorR cpu_accessor_b =
+        copy_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator);
+
+    using T = real_type_t<DT>
;
+    T const *a_data_ptr = cpu_accessor_a.get<DT>
();
+    T const *b_data_ptr = cpu_accessor_b.get<DT>
();
+
+    for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) {
+      if (a_data_ptr[i] != b_data_ptr[i]) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+};
+
+bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
+                         GenericTensorAccessorR const &accessor_b) {
+  if (accessor_a.shape != accessor_b.shape) {
+    throw mk_runtime_error(
+        fmt::format("accessors_are_equal expected accessors to have the same "
+                    "shape, but received: {} != {}",
+                    accessor_a.shape,
+                    accessor_b.shape));
   }
+  return DataTypeDispatch1<AccessorsAreEqual>{}(
+      accessor_a.data_type, accessor_a, accessor_b);
 }
 
-TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
-  return TensorShape{
-      TensorDims{
-          dims,
-      },
-      DataType::FLOAT,
-  };
+template <DataType DT>
+struct CreateFilledAccessorW {
+  GenericTensorAccessorW operator()(TensorShape const &shape,
+                                    Allocator &allocator,
+                                    DataTypeValue val) {
+    using T = real_type_t<DT>
;
+    if (!val.template has<T>()) {
+      throw mk_runtime_error("create_filled_accessor expected data type of "
+                             "shape and passed-in value to match");
+    }
+
+    auto unwrapped_value = val.get<T>();
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
+
+    T *data_ptr = src_accessor.get<DT>
();
+    for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) {
+      data_ptr[i] = unwrapped_value;
+    }
+
+    copy_accessor_data_to_l_from_r(dst_accessor, src_accessor);
+    return dst_accessor;
+  }
+};
+
+GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val) {
+
+  return DataTypeDispatch1<CreateFilledAccessorW>{}(
+      shape.data_type, shape, allocator, val);
 }
 
-TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
-  return TensorShape{
-      TensorDims{
-          dims,
-      },
-      DataType::DOUBLE,
-  };
+GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val) {
+  GenericTensorAccessorW w_accessor =
+      create_filled_accessor_w(shape, allocator, val);
+  return read_only_accessor_from_write_accessor(w_accessor);
 }
+
+} // namespace FlexFlow
diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h
index 21d4923881..8a063fea17 100644
--- a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_KERNELS_TEST_UTILS
 
 #include "kernels/device.h"
+#include "kernels/local_cpu_allocator.h"
 #include "kernels/local_cuda_allocator.h"
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
@@ -13,35 +14,96 @@
 
 using namespace FlexFlow;
 
+template <typename DT>
 GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
                                                        Allocator &allocator,
-                                                       bool cpu_fill = false);
+                                                       bool cpu_fill = false) {
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  size_t volume = accessor.shape.num_elements();
+  std::vector<DT>
host_data(volume);
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<DT>
dist(-1.0f, 1.0f);
+  for (auto &val : host_data) {
+    val = dist(gen);
+  }
+
+  if (cpu_fill) {
+    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
+  } else {
+    checkCUDA(cudaMemcpy(accessor.ptr,
+                         host_data.data(),
+                         host_data.size() * sizeof(DT),
+                         cudaMemcpyHostToDevice));
+  }
+
+  return accessor;
+}
+
+template <typename DT>
 GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
                                                 Allocator &allocator,
-                                                float val,
-                                                bool cpu_fill = false);
+                                                DT val,
+                                                bool cpu_fill = false) {
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  size_t volume = accessor.shape.num_elements();
+  std::vector<DT>
host_data(volume, val);
+
+  if (cpu_fill) {
+    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
+  } else {
+    checkCUDA(cudaMemcpy(accessor.ptr,
+                         host_data.data(),
+                         host_data.size() * sizeof(DT),
+                         cudaMemcpyHostToDevice));
+  }
+  return accessor;
+}
+
+template <typename DT>
 GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape,
                                                      Allocator &allocator,
-                                                     bool cpu_fill = false);
+                                                     bool cpu_fill = false) {
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  size_t volume = accessor.shape.num_elements();
+  std::vector<DT>
host_data(volume);
+
+  for (size_t i = 0; i < volume; i++) {
+    host_data[i] = i;
+  }
 
-void fill_tensor_accessor_w(GenericTensorAccessorW accessor,
-                            float val,
-                            bool cpu_fill = false);
+  if (cpu_fill) {
+    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
+  } else {
+    checkCUDA(cudaMemcpy(accessor.ptr,
+                         host_data.data(),
+                         host_data.size() * sizeof(DT),
+                         cudaMemcpyHostToDevice));
+  }
 
-TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered<size_t> dims);
+  return accessor;
+}
 
-TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered<size_t> dims);
+template <DataType DT>
+TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
+  return TensorShape{
+      TensorDims{
+          dims,
+      },
+      DT,
+  };
+}
 
-template <typename T>
-std::vector<T>
load_data_to_host_from_device(GenericTensorAccessorR accessor) {
+template <typename DT>
+std::vector<DT> load_data_to_host_from_device(GenericTensorAccessorR accessor) {
   int volume = accessor.shape.get_volume();
-  std::vector<T>
local_data(volume); checkCUDA(cudaMemcpy(local_data.data(), accessor.ptr, - local_data.size() * sizeof(T), + local_data.size() * sizeof(DT), cudaMemcpyDeviceToHost)); return local_data; } From 47ad0d83e894ff78304a81d3f0464b779c8e1420 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Fri, 12 Jul 2024 12:54:48 -0700 Subject: [PATCH 02/42] test utils modification, cast, reverse, and replicate cpu kernels --- .../include/kernels/cast_kernels_cpu.h | 27 ++++ .../include/kernels/replicate_kernels_cpu.h | 24 +++ .../include/kernels/reverse_kernels_cpu.h | 29 ++++ lib/kernels/src/cpu/cast_kernels.cc | 59 ++++++++ lib/kernels/src/cpu/combine_kernels.cc | 0 lib/kernels/src/cpu/replicate_kernels.cc | 61 ++++++++ lib/kernels/src/cpu/reverse_kernels.cc | 49 +++++++ lib/kernels/src/cuda/ops/reverse_kernels.cu | 36 ++++- lib/kernels/test/src/test_cast_kernel.cc | 56 +++++++ lib/kernels/test/src/test_replicate_kernel.cc | 86 +++++++++++ lib/kernels/test/src/test_reverse_kernels.cc | 105 +++++++++++++ lib/kernels/test/src/test_utils.h | 138 +++++++++++------- 12 files changed, 610 insertions(+), 60 deletions(-) create mode 100644 lib/kernels/include/kernels/cast_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/replicate_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_cpu.h create mode 100644 lib/kernels/src/cpu/cast_kernels.cc create mode 100644 lib/kernels/src/cpu/combine_kernels.cc create mode 100644 lib/kernels/src/cpu/replicate_kernels.cc create mode 100644 lib/kernels/src/cpu/reverse_kernels.cc diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..df4ef22b93 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Cast { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); + +void backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); + +} // namespace CPU +} // namespace Cast +} // namespace Kernels +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..4bc97f00ef --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Replicate { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas); + +} // namespace CPU +} // namespace Replicate +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..89ed6ffdb4 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 
+1,29 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "device.h" + +namespace FlexFlow { +namespace Kernels { +namespace Reverse { +namespace CPU { + +void forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size); + +void backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size); +} // namespace CPU +} // namespace Reverse +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc new file mode 100644 index 0000000000..cf73a84b93 --- /dev/null +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -0,0 +1,59 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { +namespace Kernels { +namespace Cast { +namespace CPU { + +template +void cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume(); + cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct BackwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume(); + cast_backward( + input.get(), output.get(), volume, cast_to(1.0f)); + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); +} + +void backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); +} + +} // namespace CPU +} // namespace Cast +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc new file mode 100644 index 0000000000..5f63d29691 --- /dev/null +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -0,0 +1,61 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow { +namespace Kernels { +namespace Replicate { +namespace CPU { + +template +void replicate_backward_kernel(T *input, + T const *output, + size_t num_elements, + size_t num_replicas) { + for (size_t i = 0; i < num_elements; ++i) { + T sum = 0; + for (size_t j = 0; j < num_replicas; ++j) { + sum += output[j * num_elements + i]; + } + input[i] = sum; + } +} + +// Why does replicate forward seem to only transfer memory? Shouldn't it also +// handle the replication? 
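+// (Replication across devices appears to be handled by the runtime's
+// parallel dims, so the per-device forward pass is just a copy -- an
+// assumption, not confirmed by this patch.)
+//
+// Illustrative example for the backward reduction below: with
+// num_elements = 2 and num_replicas = 3, output is laid out replica-major as
+// [r0e0, r0e1, r1e0, r1e1, r2e0, r2e1], so
+// input[0] = r0e0 + r1e0 + r2e0 and input[1] = r0e1 + r1e1 + r2e1.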
+template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get(), + input.get(), + input.shape.num_elements() * size_of_datatype(T)); + } +}; + +template +struct BackwardKernel { + void operator()(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + size_t total_elements = input.shape.num_elements() * num_replicas; + replicate_backward_kernel( + input.get(), output.get(), total_elements, num_replicas); + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + DataTypeDispatch1{}( + input.data_type, input, output, num_replicas); +} + +} // namespace CPU +} // namespace Replicate +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc new file mode 100644 index 0000000000..ac8ae26ca2 --- /dev/null +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -0,0 +1,49 @@ +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow { +namespace Kernels { +namespace Reverse { +namespace CPU { + +void reverse_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { + coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size; + for (coord_t i = 0; i < total_elements; ++i) { + coord_t blk_idx = i / (reverse_dim_size * in_blk_size); + coord_t offset = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = offset / in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + + (offset % in_blk_size); + out_ptr[i] = in_ptr[in_idx]; + } +} + +void forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { + reverse_forward_kernel( + in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); +} + +void backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size) { + reverse_forward_kernel( + out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); +} + +} // namespace CPU +} // namespace Reverse +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..f73c57dedf 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -21,6 +21,29 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { +// __global__ void reverse_forward_kernel(float const *in_ptr, +// float *out_ptr, +// coord_t num_out_blks, +// coord_t reverse_dim_size, +// coord_t in_blk_size) { +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = 
in_ptr[in_idx]; +// } +// } + +/* I mentioned this earlier, but I still think the reverse_forward_kernel code + is incorrect, even though it matches the code in inference/master? Whenever + I'm testing the code and printing out the output, I'm getting unexpected + outputs, and I think it's a result of modifying the loop index i in the + previous code? +*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, @@ -28,12 +51,13 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - i = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = i / in_blk_size; - i = i - reverse_dim_idx * in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = idx_within_blk / in_blk_size; + coord_t in_idx = idx_within_blk % in_blk_size; + coord_t input_index = + blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; + out_ptr[i] = in_ptr[input_index]; } } diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 1be5839a9c..b275f7ba83 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -75,4 +75,60 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100, 100}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100, 100}); + + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + auto transform = [start_val = 1.1f, + counter = 0.0f](float input) mutable -> float { + return start_val + counter++; + }; + + // Run GPU Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), + read_only_accessor_from_write_accessor(input_accessor_gpu), + output_accessor_gpu, + DataType::FLOAT, + DataType::INT32); + std::vector result_data_gpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), + true); + + // Run CPU Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + Kernels::Cast::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu, + DataType::FLOAT, + DataType::INT32); + std::vector result_data_cpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc 
b/lib/kernels/test/src/test_replicate_kernel.cc index 27223cc7b5..efe17db3f6 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -110,4 +110,90 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } + + TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { + std::size_t num_replicas = 10; + + // This should be like three shapes: pre_replication, replication shape, and + // reduced shape, but things are weird cause doesn't seem to be replicating + // anything + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape replicated_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape reduced_shape = + make_tensor_shape_from_legion_dims({10}); + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(input_shape, gpu_allocator)); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Replicate Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + copy_tensor_between_memories( + input_accessor_gpu, input_shape, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + GenericTensorAccessorR output_grad_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + input_grad_accessor_gpu, + output_grad_accessor_gpu, + num_replicas); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + output_grad_accessor_gpu, replicated_shape, cpu_allocator); + + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::CPU::backward_kernel( + input_grad_accessor_cpu, + read_only_accessor_from_write_accessor(output_grad_accessor_cpu), + num_replicas); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 4adf79847a..e46f6b6dcb 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -136,4 
+136,109 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 2; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 5; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::CPU::forward_kernel( + input_accessor_cpu.get_float_ptr(), + output_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_cpu.shape.num_elements()); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + // Run GPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_gpu = + create_random_filled_accessor_w(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + // Run CPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + read_only_accessor_from_write_accessor(output_grad_accessor_gpu), + output_shape, + cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::CPU::backward_kernel( + output_grad_accessor_cpu.get_float_ptr(), + input_grad_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_cpu.shape.num_elements()); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 8a063fea17..5638b837b1 100644 --- 
a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -14,78 +14,99 @@ using namespace FlexFlow;
 
-template <typename DT>
-GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
-                                                       Allocator &allocator,
-                                                       bool cpu_fill = false) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<DT>
host_data(volume);
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_real_distribution<DT>
dist(-1.0f, 1.0f);
-
-  for (auto &val : host_data) {
-    val = dist(gen);
-  }
+enum class GpuDirection {
+  HostToDevice = 0,
+  DeviceToHost = 1,
+  DeviceToDevice = 2
+};
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
+template <typename DT>
+void transfer_memory(DT *dst,
+                     const DT *src,
+                     size_t num_elements,
+                     GpuDirection gpu_dir,
+                     bool cpu_memory) {
+  size_t bytes = num_elements * sizeof(DT);
+
+  if (cpu_memory) {
+    memcpy(dst, src, bytes);
   } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(DT),
-                         cudaMemcpyHostToDevice));
+    switch (gpu_dir) {
+      case GpuDirection::HostToDevice:
+        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
+        break;
+      case GpuDirection::DeviceToHost:
+        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
+        break;
+      case GpuDirection::DeviceToDevice:
+        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
+        break;
+    }
   }
-
-  return accessor;
 }
 
+GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
+                                                       Allocator &allocator,
+                                                       bool on_host = false);
+
 template <typename DT>
 GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
                                                 Allocator &allocator,
                                                 DT val,
-                                                bool cpu_fill = false) {
+                                                bool on_host = false) {
   GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
   size_t volume = accessor.shape.num_elements();
   std::vector<DT>
host_data(volume, val);
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(DT),
-                         cudaMemcpyHostToDevice));
-  }
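+  // A sketch of the intended call: transfer_memory centralizes the
+  // memcpy/cudaMemcpy choice, with on_host selecting the plain-memcpy path.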
+  transfer_memory(static_cast<DT *>(accessor.ptr),
+                  host_data.data(),
+                  volume,
+                  GpuDirection::HostToDevice,
+                  on_host);
 
   return accessor;
 }
 
-template <typename DT>
-GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape,
+template <typename DT, typename F>
+GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape,
                                                      Allocator &allocator,
-                                                     bool cpu_fill = false) {
+                                                     F transform,
+                                                     bool on_host = false) {
   GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<DT>
host_data(volume);
+  size_t volume = accessor.shape.get_volume();
+  std::vector<DT> input_data(volume);
+  std::vector<DT> output_data(volume);
 
-  for (size_t i = 0; i < volume; i++) {
-    host_data[i] = i;
-  }
+  std::transform(
+      input_data.begin(), input_data.end(), output_data.begin(), transform);
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(DT),
-                         cudaMemcpyHostToDevice));
-  }
+  transfer_memory(static_cast<DT *>(accessor.ptr),
+                  output_data.data(),
+                  volume,
+                  GpuDirection::HostToDevice,
+                  on_host);
 
   return accessor;
 }
 
+template <typename DT>
+GenericTensorAccessorW
+    copy_tensor_between_memories(GenericTensorAccessorR accessor,
+                                 TensorShape const &shape,
+                                 Allocator &allocator,
+                                 bool src_on_host = false) {
+  GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape);
+
+  size_t volume = accessor.shape.get_volume();
+  GpuDirection gpu_dir =
+      src_on_host ? GpuDirection::HostToDevice : GpuDirection::DeviceToHost;
+
+  transfer_memory(
+      copied_accessor.get<DT>
(), accessor.get<DT>
(), volume, gpu_dir, false);
+
+  return copied_accessor;
+}
+
 template <DataType DT>
 TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
   return TensorShape{
@@ -96,15 +117,24 @@ TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
   };
 }
 
-template <typename DT>
-std::vector<DT>
load_data_to_host_from_device(GenericTensorAccessorR accessor) {
+template <DataType DT>
+std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorR accessor,
+                                              bool on_device = true) {
   int volume = accessor.shape.get_volume();
-  std::vector<DT>
local_data(volume);
-  checkCUDA(cudaMemcpy(local_data.data(),
-                       accessor.ptr,
-                       local_data.size() * sizeof(DT),
-                       cudaMemcpyDeviceToHost));
+  using T = real_type<DT>;
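+  // Copies accessor contents into a host-side vector; on_device = false
+  // assumes the pointer already refers to host memory (illustrative note).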
+  std::vector<T> local_data(volume);
+  T const *src_ptr = accessor.get<DT>
(); + + if (on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + std::memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } + return local_data; } From 921fe6568cb6a415f22fa878c5759bc10eefbe57 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 14 Jul 2024 15:45:59 -0700 Subject: [PATCH 03/42] combine kernel --- lib/kernels/include/kernels/cast_kernels.h | 2 - lib/kernels/src/cpu/combine_kernels.cc | 44 +++++++++++++++++++ lib/kernels/src/cpu/replicate_kernels.cc | 2 +- lib/kernels/src/cpu/reverse_kernels.cc | 1 - lib/kernels/src/local_cpu_allocator.cc | 4 +- lib/kernels/src/local_cuda_allocator.cc | 1 + lib/kernels/test/src/test_cast_kernel.cc | 4 +- lib/kernels/test/src/test_replicate_kernel.cc | 13 +++--- lib/kernels/test/src/test_reverse_kernels.cc | 8 ++-- lib/kernels/test/src/test_utils.h | 12 ++--- 10 files changed, 65 insertions(+), 26 deletions(-) diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..502a823ca7 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -3,8 +3,6 @@ #include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index e69de29bb2..f1950a56d2 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -0,0 +1,44 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { +namespace Kernels { +namespace Combine { +namespace CPU { + +template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
<DT>(),
+           input.get<DT>(),
+           input.shape.get_volume() * size_of_datatype(DT));
+  }
+};
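+
+// Note (assumption): the backward pass accumulates with += so that any
+// gradient already present in input_grad is added to, not overwritten.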
+
+template <DataType DT>
+struct BackwardKernel {
+  void operator()(GenericTensorAccessorR const &output_grad,
+                  GenericTensorAccessorW const &input_grad) {
+    size_t num_elements = output_grad.shape.get_volume();
+    for (int i = 0; i < num_elements; ++i) {
+      input_grad.get<DT>
()[i] += output_grad.get<DT>
()[i]; + } + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace CPU +} // namespace Combine +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 5f63d29691..a26d2054d1 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -14,7 +14,7 @@ void replicate_backward_kernel(T *input, for (size_t i = 0; i < num_elements; ++i) { T sum = 0; for (size_t j = 0; j < num_replicas; ++j) { - sum += output[j * num_elements + i]; + sum += output[i + j * num_elements]; } input[i] = sum; } diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index ac8ae26ca2..b035f03721 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,5 +1,4 @@ #include "kernels/reverse_kernels_cpu.h" -#include namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 6553dc2f88..9cc86c44ca 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -3,12 +3,14 @@ namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { - void *ptr = malloc(requested_memory_size); + void *ptr = calloc(1, requested_memory_size); + if (ptr != nullptr) { this->ptrs.insert(ptr); } else { throw std::bad_alloc(); } + return ptr; } diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index cdcfb017a0..dad101c64c 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -6,6 +6,7 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); this->ptrs.insert(ptr); return ptr; } diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b275f7ba83..b427b493b8 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -112,7 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_gpu), - true); + false); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index efe17db3f6..65f02f4bc9 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -116,7 +116,7 @@ TEST_SUITE(FF_TEST_SUITE) { // This should be like three shapes: pre_replication, replication shape, and // reduced shape, but things are weird cause doesn't seem to be replicating - // anything + // anything (ie. 
input shape should be same as reduced shape) TensorShape input_shape = make_tensor_shape_from_legion_dims({10, num_replicas}); TensorShape replicated_shape = @@ -142,7 +142,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -156,12 +156,13 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor_cpu); std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = read_only_accessor_from_write_accessor( create_random_filled_accessor_w(replicated_shape, gpu_allocator)); @@ -175,12 +176,12 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); + // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( output_grad_accessor_gpu, replicated_shape, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(reduced_shape); @@ -191,7 +192,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index e46f6b6dcb..f37bbba941 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -174,7 +174,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_gpu.shape.num_elements()); std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -192,7 +192,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_cpu.shape.num_elements()); std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } @@ -215,7 +215,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = @@ -236,7 +236,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 5638b837b1..80720801b6 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -119,21 +119,15 @@ TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims) { template std::vector> load_accessor_data(GenericTensorAccessorR accessor, - bool 
on_device = true) { + bool on_host = false) { int volume = accessor.shape.get_volume(); using T = real_type
<DT>;
+  std::vector<T> local_data(volume);
+  T const *src_ptr = accessor.get<DT>
(); - if (on_device) { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); - } else { - std::memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } + transfer_memory( + local_data.data(), src_ptr, volume, GpuDirection::DeviceToHost, on_host); return local_data; } From 4ca67aa7549d00a0aff3e745ff242e1bea47d3e4 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 14 Jul 2024 15:58:40 -0700 Subject: [PATCH 04/42] combine kernels .h file --- .../include/kernels/combine_kernels_cpu.h | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 lib/kernels/include/kernels/combine_kernels_cpu.h diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..1d30297af1 --- /dev/null +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Combine { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace CPU +} // namespace Combine +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H From 86edf2e4aa0d2a34d7135988efca1ad3dff10826 Mon Sep 17 00:00:00 2001 From: Marsella8 <45826022+Marsella8@users.noreply.github.com> Date: Thu, 18 Jul 2024 17:38:22 -0700 Subject: [PATCH 05/42] Implementations for methods for machine_views and associated modules (#1429) * initial commit for machine view adjacent modules * Formatting * Tests for new machine_view.cc functions * formatting * Minor Test correction * formatting * PR fixes * PR Fixes --------- Co-authored-by: Pietro Max Marsella --- lib/pcg/include/pcg/strided_rectangle.h | 17 +++++ lib/pcg/src/pcg/strided_rectangle_side.cc | 17 +++++ lib/pcg/src/strided_rectangle.cc | 35 ++++++++++ lib/pcg/test/src/test_machine_view.cc | 74 ++++++++++++++++++++++ lib/pcg/test/src/test_strided_rectangle.cc | 37 +++++++++++ 5 files changed, 180 insertions(+) create mode 100644 lib/pcg/include/pcg/strided_rectangle.h create mode 100644 lib/pcg/src/pcg/strided_rectangle_side.cc create mode 100644 lib/pcg/src/strided_rectangle.cc create mode 100644 lib/pcg/test/src/test_machine_view.cc create mode 100644 lib/pcg/test/src/test_strided_rectangle.cc diff --git a/lib/pcg/include/pcg/strided_rectangle.h b/lib/pcg/include/pcg/strided_rectangle.h new file mode 100644 index 0000000000..9c3b8eeda9 --- /dev/null +++ b/lib/pcg/include/pcg/strided_rectangle.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_PCG_INCLUDE_PCG_STRIDED_RECTANGLE_H +#define _FLEXFLOW_PCG_INCLUDE_PCG_STRIDED_RECTANGLE_H + +#include "op-attrs/ff_dim.dtg.h" +#include "pcg/side_size_t.dtg.h" +#include "pcg/strided_rectangle.dtg.h" + +namespace FlexFlow { + +size_t get_num_dims(StridedRectangle const &); +StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, + ff_dim_t const &idx); +num_points_t get_num_points(StridedRectangle const &rect); + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/strided_rectangle_side.cc b/lib/pcg/src/pcg/strided_rectangle_side.cc new file mode 100644 index 0000000000..e6caf4cb86 --- /dev/null +++ b/lib/pcg/src/pcg/strided_rectangle_side.cc @@ -0,0 
+1,17 @@ +#include "pcg/strided_rectangle_side.h" +#include "utils/exception.h" + +namespace FlexFlow { + +StridedRectangleSide strided_side_from_size_and_stride(side_size_t side_size, + int stride) { + assert((side_size.unwrapped % stride) == 0); + return StridedRectangleSide{num_points_t{side_size.unwrapped / stride}, + stride}; +} + +side_size_t get_side_size(StridedRectangleSide const &s) { + return side_size_t{s.num_points.unwrapped * s.stride}; +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/strided_rectangle.cc b/lib/pcg/src/strided_rectangle.cc new file mode 100644 index 0000000000..1c61424ab9 --- /dev/null +++ b/lib/pcg/src/strided_rectangle.cc @@ -0,0 +1,35 @@ +#include "pcg/strided_rectangle.h" +#include "op-attrs/dim_ordered/transform.h" +#include "utils/containers.h" + +namespace FlexFlow { + +/* size_t StridedRectangle::at(FFOrdered const &coord) const { */ +/* assert(coord.size() == this->num_dims()); */ + +/* size_t _1d_stride = 1; */ +/* size_t idx = 0; */ +/* for (auto dim : inner_to_outer_idxs(this->sides)) { */ +/* idx += this->sides.at(dim).at(coord.at(dim)).value() * _1d_stride; */ +/* _1d_stride *= this->sides.at(dim).get_size().value(); */ +/* } */ +/* return idx; */ +/* } */ + +size_t get_num_dims(StridedRectangle const &rect) { + return rect.sides.size(); +} + +num_points_t get_num_points(StridedRectangle const &rect) { + return num_points_t{ + product(transform(rect.sides, [](StridedRectangleSide const &side) { + return side.num_points.unwrapped; + }))}; +} + +StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, + ff_dim_t const &idx) { + return rect.sides.at(idx); +} + +} // namespace FlexFlow diff --git a/lib/pcg/test/src/test_machine_view.cc b/lib/pcg/test/src/test_machine_view.cc new file mode 100644 index 0000000000..92a96d5e9a --- /dev/null +++ b/lib/pcg/test/src/test_machine_view.cc @@ -0,0 +1,74 @@ +#include "doctest/doctest.h" +#include "pcg/machine_view.h" +#include "pcg/strided_rectangle.h" +#include "pcg/strided_rectangle_side.h" + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("MachineView general util functions") { + StridedRectangle rect{{StridedRectangleSide{num_points_t{7}, 5}, + StridedRectangleSide{num_points_t{10}, 2}}}; + gpu_id_t start(1); + MachineView mv{device_id_t{start}, rect}; + SUBCASE("num_dims") { + CHECK(num_dims(mv) == 2); + } + SUBCASE("num_devices") { + CHECK(num_devices(mv) == 7 * 10); + } + SUBCASE("get_device_type") { + CHECK(get_device_type(mv) == DeviceType::GPU); + } + } + + TEST_CASE("MachineView make_1d_machine_view - GPU") { + StridedRectangle rect{{StridedRectangleSide{num_points_t{7}, 5}}}; + device_id_t start_gpu{gpu_id_t{1}}; + MachineView gpu_mv{start_gpu, rect}; + + SUBCASE("make_1d_machine_view(gpu_id_t start, gpu_id_t stop, int stride)") { + MachineView result = + make_1d_machine_view(start_gpu, device_id_t{gpu_id_t(1 + 7 * 5)}, 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(gpu_id_t start, num_points_t num_points, int " + "stride)") { + MachineView result = make_1d_machine_view(start_gpu, num_points_t{7}, 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(gpu_id_t start, side_size_t interval_size, " + "int stride)") { + MachineView result = make_1d_machine_view( + start_gpu, get_side_size(rect.sides.at(ff_dim_t{0})), 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + } + + TEST_CASE("MachineView make_1d_machine_view - CPU") { + StridedRectangle 
rect{{StridedRectangleSide{num_points_t{11}, 4}}}; + device_id_t start_cpu{cpu_id_t{2}}; + MachineView cpu_mv{start_cpu, rect}; + + SUBCASE("make_1d_machine_view(cpu_id_t start, cpu_id_t stop, int stride)") { + MachineView result = + make_1d_machine_view(start_cpu, device_id_t{cpu_id_t(2 + 11 * 4)}, 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(cpu_id_t start, num_points_t num_points, int " + "stride)") { + MachineView result = make_1d_machine_view(start_cpu, num_points_t{11}, 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(cpu_id_t start, side_size_t interval_size, " + "int stride)") { + MachineView result = make_1d_machine_view( + start_cpu, get_side_size(rect.sides.at(ff_dim_t{0})), 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + } +} diff --git a/lib/pcg/test/src/test_strided_rectangle.cc b/lib/pcg/test/src/test_strided_rectangle.cc new file mode 100644 index 0000000000..ef342944de --- /dev/null +++ b/lib/pcg/test/src/test_strided_rectangle.cc @@ -0,0 +1,37 @@ +#include "doctest/doctest.h" +#include "pcg/strided_rectangle.h" +#include "pcg/strided_rectangle_side.h" + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_side_size(StridedRectangleSide)") { + StridedRectangleSide side{num_points_t{7}, 5}; + + CHECK(get_side_size(side) == side_size_t{7 * 5}); + } + TEST_CASE("strided_side_from_size_and_stride") { + StridedRectangleSide correct{num_points_t{10}, 3}; + StridedRectangleSide result = + strided_side_from_size_and_stride(side_size_t{10 * 3}, 3); + CHECK(result == correct); + } + + TEST_CASE("StridedRectangle - helper functions") { + + StridedRectangleSide s0{num_points_t{7}, 5}; + StridedRectangleSide s1{num_points_t{10}, 2}; + StridedRectangleSide s2{num_points_t{8}, 1}; + StridedRectangle rect{{s0, s1, s2}}; + + SUBCASE("get_num_dims") { + CHECK(get_num_dims(rect) == 3); + } + SUBCASE("get_num_points") { + CHECK(get_num_points(rect) == num_points_t{7 * 8 * 10}); + } + SUBCASE("get_side_at_idx") { + CHECK(get_side_at_idx(rect, ff_dim_t{0}) == s0); + CHECK(get_side_at_idx(rect, ff_dim_t{1}) == s1); + CHECK(get_side_at_idx(rect, ff_dim_t{2}) == s2); + } + } +} From d9af610c5f940b1c06455fb938d7e589abaf712b Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Wed, 31 Jul 2024 04:49:13 -0700 Subject: [PATCH 06/42] test utils logic cleanup, reverse cpu_kernel pedagogical implmentation, other minor fixes --- lib/kernels/include/kernels/accessor.h | 26 ++- lib/kernels/include/kernels/allocation.h | 9 + .../include/kernels/cast_kernels_cpu.h | 18 +- .../include/kernels/combine_kernels_cpu.h | 10 +- .../include/kernels/local_cpu_allocator.h | 1 + .../include/kernels/local_cuda_allocator.h | 1 + .../include/kernels/replicate_kernels_cpu.h | 12 +- .../include/kernels/reverse_kernels_cpu.h | 26 ++- lib/kernels/src/accessor.cc | 6 +- lib/kernels/src/allocation.cc | 14 +- lib/kernels/src/cpu/cast_kernels.cc | 35 ++-- lib/kernels/src/cpu/combine_kernels.cc | 18 +- lib/kernels/src/cpu/replicate_kernels.cc | 30 ++-- lib/kernels/src/cpu/reverse_kernels.cc | 88 ++++++---- lib/kernels/src/local_cpu_allocator.cc | 18 +- lib/kernels/src/local_cuda_allocator.cc | 11 +- lib/kernels/test/src/test_attention_kernel.cc | 26 ++- .../test/src/test_batch_matmul_kernel.cc | 12 +- .../test/src/test_batch_norm_kernel.cc | 15 +- lib/kernels/test/src/test_cast_kernel.cc | 25 +-- lib/kernels/test/src/test_dropout.cc | 6 +- lib/kernels/test/src/test_gather_kernels.cc | 3 +- 
.../test/src/test_layer_norm_kernels.cc | 3 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_replicate_kernel.cc | 46 +++-- lib/kernels/test/src/test_reverse_kernels.cc | 38 ++--- lib/kernels/test/src/test_softmax_kernel.cc | 6 +- lib/kernels/test/src/test_split_kernel.cc | 6 +- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- lib/kernels/test/src/test_utils.h | 160 +++++++++++------- .../local-execution/tracked_allocator.h | 1 + lib/local-execution/src/tracked_allocator.cc | 10 +- 32 files changed, 417 insertions(+), 272 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..e30e1fe825 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -28,15 +28,20 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; + GenericTensorAccessorW(DataType dt, + ArrayShape sh, + req p, + bool on_dev = true) + : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + public: DataType data_type; ArrayShape shape; req ptr; + bool on_device; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + GenericTensorAccessorW, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorW const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); @@ -59,15 +64,20 @@ class GenericTensorAccessorR { double const *get_double_ptr() const; half const *get_half_ptr() const; + GenericTensorAccessorR(DataType dt, + ArrayShape sh, + req p, + bool on_dev = true) + : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + public: DataType data_type; ArrayShape shape; req ptr; + bool on_device; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + GenericTensorAccessorR, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorR const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..452ccc47b0 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,10 +5,13 @@ #include #include +enum class AllocLocation { HOST, DEVICE }; + namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; + virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; virtual ~IAllocator() = default; @@ -18,7 +21,11 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + GenericTensorAccessorW + allocate_tensor_and_zero(TensorShape const &tensor_shape); + void *allocate(size_t mem_size); + void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); template @@ -30,6 +37,8 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; + AllocLocation alloc_location; + private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index df4ef22b93..cae0c9da8d 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -7,19 +7,17 @@ namespace FlexFlow { namespace Kernels { namespace Cast { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, 
- GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); -void backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); +void cpu_backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); -} // namespace CPU } // namespace Cast } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h index 1d30297af1..66c22ddbf8 100644 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -7,15 +7,13 @@ namespace FlexFlow { namespace Kernels { namespace Combine { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); -void backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace CPU } // namespace Combine } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h index 27dcc9d854..121ed184e9 100644 --- a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -10,6 +10,7 @@ struct LocalCPUAllocator : public IAllocator { ~LocalCPUAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; private: diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..16f60daead 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,6 +10,7 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; private: diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h index 4bc97f00ef..11d2f1bf5c 100644 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -7,16 +7,14 @@ namespace FlexFlow { namespace Kernels { namespace Replicate { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); -void backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, - size_t num_replicas); +void cpu_backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas); -} // namespace CPU } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index 89ed6ffdb4..bb17aa9400 100644 --- 
a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -6,22 +6,20 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { -namespace CPU { -void forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); +void cpu_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size); -void backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); -} // namespace CPU +void cpu_backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size); } // namespace Reverse } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..66d3c02300 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -156,8 +156,10 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.on_device}; } bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ccd88580db..ce06fbabe0 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -7,6 +7,10 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } +void *Allocator::allocate_and_zero(size_t mem_size) { + return this->i_allocator->allocate_and_zero(mem_size); +} + void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } @@ -14,7 +18,15 @@ void Allocator::deallocate(void *ptr) { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + bool on_device = this->alloc_location == AllocLocation::DEVICE; + return {tensor_shape.data_type, tensor_shape, ptr, on_device}; +} + +GenericTensorAccessorW + Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) { + void *ptr = this->allocate_and_zero(get_size_in_bytes(tensor_shape)); + bool on_device = this->alloc_location == AllocLocation::DEVICE; + return {tensor_shape.data_type, tensor_shape, ptr, on_device}; } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index cf73a84b93..5888d9a96a 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -4,56 +4,55 @@ namespace FlexFlow { namespace Kernels { namespace Cast { -namespace CPU { template -void cast_forward(IDT const *input, ODT *output, size_t volume) { +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { for (size_t i = 0; i < volume; ++i) { output[i] = static_cast(input[i]); } } template -void cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { for (size_t i = 0; i < volume; i++) { output[i] = static_cast(input[i]) + beta * output[i]; } } template 
<DataType IDT, DataType ODT>
-struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { size_t volume = input.shape.get_volume(); - cast_forward(input.get(), output.get(), volume); + cpu_cast_forward(input.get(), output.get(), volume); } }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { size_t volume = input.shape.get_volume(); - cast_backward( + cpu_cast_backward( input.get(), output.get(), volume, cast_to(1.0f)); } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); } -void backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); +void cpu_backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}( + input_type, output_type, input, output); } -} // namespace CPU } // namespace Cast } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index f1950a56d2..e48f4c3e01 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -4,10 +4,9 @@ namespace FlexFlow { namespace Kernels { namespace Combine { -namespace CPU { template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { memcpy(output.get
<DT>
(), @@ -17,7 +16,7 @@ struct ForwardKernel { }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { size_t num_elements = output_grad.shape.get_volume(); @@ -27,18 +26,17 @@ struct BackwardKernel { } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); } -void backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}( +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( input_grad.data_type, output_grad, input_grad); } -} // namespace CPU } // namespace Combine } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index a26d2054d1..239baf4041 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -4,13 +4,12 @@ namespace FlexFlow { namespace Kernels { namespace Replicate { -namespace CPU { template -void replicate_backward_kernel(T *input, - T const *output, - size_t num_elements, - size_t num_replicas) { +void cpu_replicate_backward_kernel(T *input, + T const *output, + size_t num_elements, + size_t num_replicas) { for (size_t i = 0; i < num_elements; ++i) { T sum = 0; for (size_t j = 0; j < num_replicas; ++j) { @@ -23,7 +22,7 @@ void replicate_backward_kernel(T *input, // Why does replicate forward seem to only transfer memory? Shouldn't it also // handle the replication? 
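// [editor's note, not part of the original patch] The likely answer to the
// question above: in this codebase the replication degree is carried by the
// parallel tensor shape, so on a single device the forward pass of Replicate
// really is just a copy of the local shard, and fan-out across replicas is
// handled by the surrounding runtime rather than by this kernel. A CPU
// forward that did materialize every replica could look like the sketch
// below; cpu_replicate_forward_kernel is a hypothetical name, the sketch
// assumes a contiguous output of num_elements * num_replicas elements, and
// it needs <cstring> for std::memcpy:
//
//   template <typename T>
//   void cpu_replicate_forward_kernel(T const *input,
//                                     T *output,
//                                     size_t num_elements,
//                                     size_t num_replicas) {
//     for (size_t j = 0; j < num_replicas; ++j) {
//       // every replica receives an identical copy of the input shard
//       std::memcpy(output + j * num_elements, input, num_elements * sizeof(T));
//     }
//   }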
template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { memcpy(output.get(), @@ -33,29 +32,28 @@ struct ForwardKernel { }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; - replicate_backward_kernel( + cpu_replicate_backward_kernel( input.get(), output.get(), total_elements, num_replicas); } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); } -void backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, - size_t num_replicas) { - DataTypeDispatch1{}( +void cpu_backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + DataTypeDispatch1{}( input.data_type, input, output, num_replicas); } -} // namespace CPU } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index b035f03721..350dad03e9 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,48 +1,78 @@ #include "kernels/reverse_kernels_cpu.h" +#include +#include namespace FlexFlow { namespace Kernels { namespace Reverse { -namespace CPU { -void reverse_forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { +void cpu_reverse_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size; - for (coord_t i = 0; i < total_elements; ++i) { - coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t offset = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = offset / in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + - (offset % in_blk_size); - out_ptr[i] = in_ptr[in_idx]; + + std::vector> in_blocks(num_out_blks * reverse_dim_size, + std::vector(in_blk_size)); + + // For each output block, copy the input block into in_blocks + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + // Each output block has reverse_dim_size input blocks + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + + // Copy elements from in_ptr to the current block in in_blocks + std::vector ¤t_block = + in_blocks[blk_idx * reverse_dim_size + rev_idx]; + for (coord_t i = 0; i < in_blk_size; ++i) { + current_block[i] = in_ptr[start_idx + i]; + } + } + } + + // Reverse the in_blocks within each output block + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + auto block_start = in_blocks.begin() + blk_idx * reverse_dim_size; + auto block_end = block_start + reverse_dim_size; + std::reverse(block_start, block_end); + } + + // Copy the reversed blocks to the output array + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) 
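// [editor's note, not part of the original patch] Worked example of the
// reversal this final copy-out loop completes, with num_out_blks = 2,
// reverse_dim_size = 3, in_blk_size = 2: the input
//   [a0 a1 | b0 b1 | c0 c1 || d0 d1 | e0 e1 | f0 f1]
// becomes
//   [c0 c1 | b0 b1 | a0 a1 || f0 f1 | e0 e1 | d0 d1]
// Only the order of the reverse-dimension blocks flips; elements inside each
// in_blk, and the outer blocks themselves, stay in place. Equivalently, for a
// flat output index i (restating the closed form used by the removed one-pass
// kernel above, as a correctness check):
//   blk = i / (reverse_dim_size * in_blk_size)
//   rev = (i / in_blk_size) % reverse_dim_size
//   off = i % in_blk_size
//   out[i] = in[(blk * reverse_dim_size + (reverse_dim_size - 1 - rev))
//                 * in_blk_size + off]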
{ + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + + // Copy elements from the current block in in_blocks to out_ptr + std::vector const ¤t_block = + in_blocks[blk_idx * reverse_dim_size + rev_idx]; + for (coord_t i = 0; i < in_blk_size; ++i) { + out_ptr[start_idx + i] = current_block[i]; + } + } } } -void forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { - reverse_forward_kernel( +void cpu_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { + cpu_reverse_forward_kernel( in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -void backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size) { - reverse_forward_kernel( +void cpu_backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size) { + cpu_reverse_forward_kernel( out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -} // namespace CPU } // namespace Reverse } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 9cc86c44ca..ced707edcc 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -3,6 +3,18 @@ namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + + if (ptr != nullptr) { + this->ptrs.insert(ptr); + } else { + throw std::bad_alloc(); + } + + return ptr; +} + +void *LocalCPUAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr = calloc(1, requested_memory_size); if (ptr != nullptr) { @@ -25,13 +37,15 @@ void LocalCPUAllocator::deallocate(void *ptr) { } LocalCPUAllocator::~LocalCPUAllocator() { - for (auto ptr : ptrs) { + for (void *ptr : this->ptrs) { free(ptr); } } Allocator create_local_cpu_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + allocator.alloc_location = AllocLocation::HOST; + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index dad101c64c..b6c615a5ca 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -4,6 +4,13 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + this->ptrs.insert(ptr); + return ptr; +} + +void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); @@ -28,7 +35,9 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + allocator.alloc_location = AllocLocation::DEVICE; + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 023233ecb0..c4a3f7bd50 100644 --- 
a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -47,13 +47,16 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -73,15 +76,20 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, + allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 8a11a069f5..bb9c4c07bd 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -29,11 +29,14 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, allocator); + create_random_filled_accessor_w(input_shape_a, + allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, allocator); + create_random_filled_accessor_w(input_shape_b, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -52,7 +55,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 03a3a1ad40..43bcc5528a 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -36,7 +36,8 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, 
DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w( @@ -58,13 +59,17 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, allocator); + create_random_filled_accessor_w(scale_shape, + allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, allocator); + create_random_filled_accessor_w(bias_shape, + allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b427b493b8..a6990d2ad0 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -83,9 +83,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -102,31 +102,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before GPU load" << std::endl; std::vector result_data_gpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), - false); + load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); - Kernels::Cast::CPU::forward_kernel( + input_shape, cpu_allocator, transform); + Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before CPU load" << std::endl; + if (output_accessor_cpu.on_device) { + std::cout << "CPU data is on device" << std::endl; + } else { + std::cout << "CPU data is on host" << std::endl; + } std::vector result_data_cpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), - true); + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 4be2bdf7bb..7ff364bada 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -47,9 +47,11 @@ TEST_SUITE(FF_TEST_SUITE) { 
SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 7f97563217..4f05c89813 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -41,7 +41,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 7d7298f83d..3ac0e1425f 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -56,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 00fa968235..f71d9cfa11 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -45,9 +45,11 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 65f02f4bc9..e952f1107f 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -118,11 +118,11 @@ TEST_SUITE(FF_TEST_SUITE) { // reduced shape, but things are weird cause doesn't seem to be replicating // anything (ie. 
input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}); + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -133,30 +133,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { // Run GPU Replicate Forward Kernel GenericTensorAccessorR input_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, gpu_allocator)); + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(replicated_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories( - input_accessor_gpu, input_shape, cpu_allocator); + copy_tensor_between_memories(input_accessor_gpu, + cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(replicated_shape); - Kernels::Replicate::CPU::forward_kernel( + Kernels::Replicate::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -164,35 +164,33 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + create_random_filled_accessor_r(replicated_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(reduced_shape); + gpu_allocator.allocate_tensor_and_zero(reduced_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, output_grad_accessor_gpu, num_replicas); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( - output_grad_accessor_gpu, replicated_shape, cpu_allocator); + output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(reduced_shape); + cpu_allocator.allocate_tensor_and_zero(reduced_shape); - Kernels::Replicate::CPU::backward_kernel( + Kernels::Replicate::cpu_backward_kernel( input_grad_accessor_cpu, read_only_accessor_from_write_accessor(output_grad_accessor_cpu), num_replicas); - std::vector result_data_cpu = load_accessor_data( - 
read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index f37bbba941..7899afa718 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -42,7 +42,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -142,9 +143,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t reverse_dim_size = 3; std::size_t in_blk_size = 5; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -161,7 +161,7 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Cast Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -173,17 +173,17 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); + input_shape, cpu_allocator, transform); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); - Kernels::Reverse::CPU::forward_kernel( + Kernels::Reverse::cpu_forward_kernel( input_accessor_cpu.get_float_ptr(), output_accessor_cpu.get_float_ptr(), num_out_blks, @@ -191,8 +191,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -200,7 +200,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_gpu = - create_random_filled_accessor_w(output_shape, gpu_allocator); + create_random_filled_accessor_w(output_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = gpu_allocator.allocate_tensor(input_shape); @@ -213,20 +214,18 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( read_only_accessor_from_write_accessor(output_grad_accessor_gpu), - output_shape, 
cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(input_shape); - Kernels::Reverse::CPU::backward_kernel( + Kernels::Reverse::cpu_backward_kernel( output_grad_accessor_cpu.get_float_ptr(), input_grad_accessor_cpu.get_float_ptr(), num_out_blks, @@ -234,9 +233,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 5519c30b80..88f24a1a08 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -23,11 +23,13 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 34993fa151..9f1d390501 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -27,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -48,7 +49,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 0bc85cb8e0..c8baaac54f 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -39,7 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 80720801b6..d4511c9dc5 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -14,55 +14,74 @@ using namespace FlexFlow; -enum class GpuDirection { - HostToDevice = 0, - DeviceToHost = 1, - DeviceToDevice = 2 -}; - template -void transfer_memory(DT *dst, +void transfer_memory(GenericTensorAccessorW dst_accessor, const DT *src, - size_t num_elements, - GpuDirection gpu_dir, - bool cpu_memory) { - size_t bytes = num_elements * sizeof(DT); - - if 
(cpu_memory) { - memcpy(dst, src, bytes); + AllocLocation src_loc) { + size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT); + AllocLocation dst_loc = + dst_accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST; + + if (src_loc == AllocLocation::HOST && dst_loc == AllocLocation::HOST) { + memcpy(dst_accessor.ptr, src, bytes); + } else if (src_loc == AllocLocation::HOST && + dst_loc == AllocLocation::DEVICE) { + checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice)); + } else if (src_loc == AllocLocation::DEVICE && + dst_loc == AllocLocation::HOST) { + checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost)); } else { - switch (gpu_dir) { - case GpuDirection::HostToDevice: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice)); - break; - case GpuDirection::DeviceToHost: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost)); - break; - case GpuDirection::DeviceToDevice: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice)); - break; - } + checkCUDA( + cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToDevice)); } } +template GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool on_host = false); + Allocator &allocator) { + assert(shape.data_type == DataType::FLOAT || + shape.data_type == DataType::DOUBLE); + using T = real_type
<DT>
; + + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + + std::vector host_data(accessor.shape.num_elements()); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0, 1.0); + + for (auto &val : host_data) { + val = dist(gen); + } + + transfer_memory(accessor, host_data.data(), AllocLocation::HOST); + + return accessor; +} + +template +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w
<DT>
(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} template GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - DT val, - bool on_host = false) { + DT val) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + + size_t volume = accessor.shape.get_volume(); std::vector
<DT>
host_data(volume, val); - transfer_memory(static_cast
<DT *>
(accessor.ptr), - host_data.data(), - volume, - GpuDirection::HostToDevice, - on_host); + transfer_memory(accessor, host_data.data(), AllocLocation::HOST); return accessor; } @@ -70,9 +89,11 @@ GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, template GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, Allocator &allocator, - F transform, - bool on_host = false) { + F transform) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + size_t volume = accessor.shape.get_volume(); std::vector input_data(volume); std::vector output_data(volume); @@ -80,11 +101,7 @@ GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, std::transform( input_data.begin(), input_data.end(), output_data.begin(), transform); - transfer_memory(static_cast(accessor.ptr), - output_data.data(), - volume, - GpuDirection::HostToDevice, - on_host); + transfer_memory(accessor, output_data.data(), AllocLocation::HOST); return accessor; } @@ -92,42 +109,59 @@ GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, template GenericTensorAccessorW copy_tensor_between_memories(GenericTensorAccessorR accessor, - TensorShape const &shape, - Allocator &allocator, - bool src_on_host = false) { + Allocator &allocator) { + TensorShape shape = get_tensor_shape(accessor.shape, accessor.data_type); GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape); + copied_accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; - size_t volume = accessor.shape.get_volume(); - GpuDirection gpu_dir = - src_on_host ? GpuDirection::HostToDevice : GpuDirection::DeviceToHost; + AllocLocation src_loc = + accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST; - transfer_memory( - copied_accessor.get
<DT>
(), accessor.get
<DT>
(), volume, gpu_dir, false); + transfer_memory(copied_accessor, accessor.get
<DT>
(), src_loc); return copied_accessor; } -template -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DT, - }; -} +TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, + DataType DT); template -std::vector> load_accessor_data(GenericTensorAccessorR accessor, - bool on_host = false) { +std::vector> load_accessor_data(GenericTensorAccessorR accessor) { + using T = real_type
<DT>
; + int volume = accessor.shape.get_volume(); + std::vector local_data(volume); + T const *src_ptr = accessor.get
<DT>
(); + + if (accessor.on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } + + return local_data; +} +template +std::vector> load_accessor_data(GenericTensorAccessorW accessor) { using T = real_type
<DT>
; + + int volume = accessor.shape.get_volume(); std::vector local_data(volume); T const *src_ptr = accessor.get
<DT>
(); - transfer_memory( - local_data.data(), src_ptr, volume, GpuDirection::DeviceToHost, on_host); + if (accessor.on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } return local_data; } diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..d6f338fe14 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -12,6 +12,7 @@ struct TrackedAllocator : public IAllocator { ~TrackedAllocator() = default; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; size_t get_current_mem_usage(); diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..9f13f006f3 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,6 +12,12 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } +void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { + void *ptr = this->allocator.allocate_and_zero(requested_memory_size); + this->current_mem_usage += requested_memory_size; + return ptr; +} + void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); @@ -24,7 +30,9 @@ size_t TrackedAllocator::get_current_mem_usage() { } Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + allocator.alloc_location = base_allocator.alloc_location; + return allocator; } } // namespace FlexFlow From 64034a585d991703c3f958d263ec82dc8df1b884 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:18:45 -0700 Subject: [PATCH 07/42] cpu_kernel's refactor, generic tensor accessor indexing --- lib/kernels/CMakeLists.txt | 1 + lib/kernels/include/kernels/accessor.h | 112 ++++++++++++++--- lib/kernels/include/kernels/allocation.h | 12 +- .../include/kernels/attention_kernels.h | 6 +- .../include/kernels/batch_matmul_kernels.h | 8 +- .../include/kernels/batch_norm_kernels.h | 6 +- lib/kernels/include/kernels/cast_kernels.h | 8 +- .../include/kernels/cast_kernels_cpu.h | 8 +- lib/kernels/include/kernels/combine_kernels.h | 8 +- .../include/kernels/combine_kernels_cpu.h | 8 +- lib/kernels/include/kernels/concat_kernels.h | 8 +- lib/kernels/include/kernels/conv_2d_kernels.h | 6 +- .../include/kernels/datatype_dispatch.h | 3 +- lib/kernels/include/kernels/dropout_kernels.h | 6 +- .../include/kernels/element_binary_kernels.h | 6 +- .../include/kernels/element_unary_kernels.h | 6 +- .../include/kernels/embedding_kernels.h | 8 +- lib/kernels/include/kernels/flat_kernels.h | 8 +- lib/kernels/include/kernels/gather_kernels.h | 6 +- .../include/kernels/layer_norm_kernels.h | 6 +- lib/kernels/include/kernels/linear_kernels.h | 6 +- .../include/kernels/local_cpu_allocator.h | 7 +- .../include/kernels/local_cuda_allocator.h | 3 +- lib/kernels/include/kernels/nccl.h | 8 +- .../include/kernels/partition_kernels.h | 6 +- lib/kernels/include/kernels/pool_2d_kernels.h | 6 +- lib/kernels/include/kernels/reduce_kernels.h | 6 +- .../include/kernels/reduction_kernels.h | 8 +- .../include/kernels/replicate_kernels.h | 8 +- 
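[editor's note, not part of the original patch series] The most interesting
piece of this patch (07/42) is the new element-access path on the accessors:
both GenericTensorAccessorW and GenericTensorAccessorR gain a variadic
at<DT>(indices...) that rejects non-CPU tensors and mismatched DataTypes, then
delegates to a private calculate_index_offset helper whose definition lands in
lib/kernels/src/accessor.cc (not reproduced in this excerpt). Below is a
minimal sketch of the row-major arithmetic such a helper has to perform; the
free-function signature and the explicit dims parameter are illustrative
assumptions, not the patch's exact interface:

#include <cassert>
#include <cstddef>
#include <initializer_list>
#include <vector>

size_t calculate_index_offset(std::initializer_list<size_t> const &indices,
                              std::vector<size_t> const &dims) {
  // Row-major flattening: offset = ((i0 * d1 + i1) * d2 + i2) * ...
  // The first multiplication scales an accumulator of 0, so d0 itself never
  // multiplies anything -- only the trailing dimensions do.
  assert(indices.size() == dims.size());
  size_t offset = 0;
  size_t dim_idx = 0;
  for (size_t idx : indices) {
    assert(idx < dims[dim_idx]); // reject out-of-range coordinates
    offset = offset * dims[dim_idx] + idx;
    ++dim_idx;
  }
  return offset;
}

For dims = {2, 3, 4}, calculate_index_offset({1, 2, 3}, dims) returns
(1 * 3 + 2) * 4 + 3 = 23, the familiar flattened index.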
.../include/kernels/replicate_kernels_cpu.h | 8 +- lib/kernels/include/kernels/reshape_kernels.h | 6 +- lib/kernels/include/kernels/reverse_kernels.h | 8 +- .../include/kernels/reverse_kernels_cpu.h | 23 ++-- lib/kernels/include/kernels/softmax_kernels.h | 6 +- lib/kernels/include/kernels/split_kernels.h | 9 +- lib/kernels/include/kernels/topk_kernels.h | 6 +- .../include/kernels/transpose_kernels.h | 6 +- lib/kernels/src/accessor.cc | 118 +++++++++++++++++- lib/kernels/src/allocation.cc | 19 +-- lib/kernels/src/array_shape.cc | 1 + lib/kernels/src/cpu/cast_kernels.cc | 8 +- lib/kernels/src/cpu/combine_kernels.cc | 8 +- lib/kernels/src/cpu/replicate_kernels.cc | 21 ++-- lib/kernels/src/cpu/reverse_kernels.cc | 101 +++++++-------- lib/kernels/src/cuda/ops/concat_kernels.cu | 8 +- lib/kernels/src/local_cpu_allocator.cc | 31 +---- lib/kernels/src/local_cuda_allocator.cc | 13 +- lib/kernels/test/CMakeLists.txt | 1 + lib/kernels/test/src/test_cast_kernel.cc | 24 ++-- lib/kernels/test/src/test_replicate_kernel.cc | 51 ++++---- lib/kernels/test/src/test_reverse_kernels.cc | 53 ++++---- lib/kernels/test/src/test_utils.h | 95 ++++---------- .../local-execution/local_cpu_allocator.h | 2 + .../local-execution/tracked_allocator.h | 4 +- .../src/local_cpu_allocator.cc | 4 + .../src/local_task_argument_accessor.cc | 11 +- lib/local-execution/src/tracked_allocator.cc | 11 +- 57 files changed, 473 insertions(+), 481 deletions(-) diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..fc91b7d3db 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -30,6 +30,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index e30e1fe825..846115060f 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -5,6 +5,7 @@ #include "device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" +#include "pcg/device_type.dtg.h" #include "utils/exception.h" #include "utils/required.h" @@ -28,20 +29,65 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; - GenericTensorAccessorW(DataType dt, - ArrayShape sh, - req p, - bool on_dev = true) - : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + GenericTensorAccessorW() = delete; + + GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, void *ptr, DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + template + real_type_t
<DT>
&at(Indices... indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
<DT>
; + + T *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } + + template + real_type_t
<DT>
const &at(Indices... indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
<DT>
; + + T const *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; - bool on_device; + void *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; + + size_t calculate_index_offset( + std::initializer_list const &indices) const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - GenericTensorAccessorW, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorW const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); @@ -64,20 +110,50 @@ class GenericTensorAccessorR { double const *get_double_ptr() const; half const *get_half_ptr() const; - GenericTensorAccessorR(DataType dt, - ArrayShape sh, - req p, - bool on_dev = true) - : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
<DT>
const &at(Indices... indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
<DT>
; + + T const *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; - bool on_device; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; + + size_t calculate_index_offset( + std::initializer_list const &indices) const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - GenericTensorAccessorR, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorR const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 452ccc47b0..893be513ea 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,15 +5,14 @@ #include #include -enum class AllocLocation { HOST, DEVICE }; - namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; - virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -21,13 +20,12 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); - GenericTensorAccessorW - allocate_tensor_and_zero(TensorShape const &tensor_shape); void *allocate(size_t mem_size); - void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type @@ -37,8 +35,6 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; - AllocLocation alloc_location; - private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..1e483102dd 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..bde91bea15 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -5,9 +5,7 @@ #include "kernels/allocation.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 
7d533d672c..4de6ac6af0 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -43,8 +43,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState, output_w, relu); -namespace Kernels { -namespace BatchNorm { +namespace Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -81,8 +80,7 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels +} // namespace Kernels::BatchNorm } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 502a823ca7..f67613cec6 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -20,8 +18,6 @@ void backward_kernel(ffStream_t stream, DataType input_type, DataType output_type); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index cae0c9da8d..959617dcae 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void cpu_backward_kernel(GenericTensorAccessorR const &input, DataType input_type, DataType output_type); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..50de18e823 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h index 66c22ddbf8..430c7cf906 100644 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); @@ -14,8 +12,6 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input, void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, 
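Note: the namespace consolidation applied across all of these kernel headers relies on C++17 nested namespace definitions, which declare exactly the same entities as the nested form but with a single open/close pair. A minimal sketch of the equivalence (the function name here is illustrative, not from the patch):

    // Pre-C++17 form: every level opened and closed separately.
    namespace FlexFlow { namespace Kernels { namespace Combine {
    void cpu_forward_kernel();
    }}} // namespace FlexFlow::Kernels::Combine

    // C++17 form: identical symbols and mangling, one brace pair.
    namespace FlexFlow::Kernels::Combine {
    void cpu_forward_kernel();
    } // namespace FlexFlow::Kernels::Combine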
diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h
index 66c22ddbf8..430c7cf906 100644
--- a/lib/kernels/include/kernels/combine_kernels_cpu.h
+++ b/lib/kernels/include/kernels/combine_kernels_cpu.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Combine {
+namespace FlexFlow::Kernels::Combine {

 void cpu_forward_kernel(GenericTensorAccessorR const &input,
                         GenericTensorAccessorW const &output);
@@ -14,8 +12,6 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input,
 void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
                          GenericTensorAccessorW const &input_grad);

-} // namespace Combine
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Combine

 #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H

diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h
index a44affc1f2..33355296dd 100644
--- a/lib/kernels/include/kernels/concat_kernels.h
+++ b/lib/kernels/include/kernels/concat_kernels.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Concat {
+namespace FlexFlow::Kernels::Concat {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorW const &output,
@@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream,
                      std::vector<GenericTensorAccessorR> const &input_grads,
                      ff_dim_t axis);

-} // namespace Concat
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Concat

 #endif

diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h
index cfc64f963d..217751e191 100644
--- a/lib/kernels/include/kernels/conv_2d_kernels.h
+++ b/lib/kernels/include/kernels/conv_2d_kernels.h
@@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState,
                                              bwdFilterAlgo,
                                              bwdDataAlgo);

-namespace Kernels {
-namespace Conv2D {
+namespace Kernels::Conv2D {

 Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                  std::optional<Activation> activation,
@@ -70,8 +69,7 @@ void backward_kernel(ffStream_t stream,
                      float *bias_grad_ptr,
                      std::optional<Activation> activation);

-} // namespace Conv2D
-} // namespace Kernels
+} // namespace Kernels::Conv2D
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H

diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h
index e83fc3325d..0986d99791 100644
--- a/lib/kernels/include/kernels/datatype_dispatch.h
+++ b/lib/kernels/include/kernels/datatype_dispatch.h
@@ -1,7 +1,8 @@
 #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H
 #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H

-#include "accessor.h"
+#include "op-attrs/datatype.h"
+#include "utils/exception.h"

 namespace FlexFlow {

diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h
index c0e503be5b..4790540098 100644
--- a/lib/kernels/include/kernels/dropout_kernels.h
+++ b/lib/kernels/include/kernels/dropout_kernels.h
@@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState,
                                              reserveSpaceSize,
                                              dropoutStateSize);

-namespace Kernels {
-namespace Dropout {
+namespace Kernels::Dropout {

 DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                   float rate,
@@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator,
                     ffDropoutDescriptor_t dropoutDesc,
                     void *dropoutStates);

-} // namespace Dropout
-} // namespace Kernels
+} // namespace Kernels::Dropout
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H

diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h
index 41447e98e6..1017230fb0 100644
--- a/lib/kernels/include/kernels/element_binary_kernels.h
+++ b/lib/kernels/include/kernels/element_binary_kernels.h
@@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState,
                                              opDesc,
                                              reduceAddDesc);

-namespace Kernels {
-namespace ElementBinary {
+namespace Kernels::ElementBinary {

 ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                         OperatorType op_type,
@@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream,
                      bool broadcast_inputRHS,
                      PerDeviceFFHandle handle);

-} // namespace ElementBinary
-} // namespace Kernels
+} // namespace Kernels::ElementBinary
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h
index 8c6864b2d9..26ce4ecaec 100644
--- a/lib/kernels/include/kernels/element_unary_kernels.h
+++ b/lib/kernels/include/kernels/element_unary_kernels.h
@@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState,
                           outputTensor,
                           actiDesc);

-namespace Kernels {
-namespace ElementUnary {
+namespace Kernels::ElementUnary {

 ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape,
                                        ArrayShape const &output_shape,
@@ -42,8 +41,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &output,
                      GenericTensorAccessorR const &output_grad);

-} // namespace ElementUnary
-} // namespace Kernels
+} // namespace Kernels::ElementUnary
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h
index 06582ca1d5..6d5141f489 100644
--- a/lib/kernels/include/kernels/embedding_kernels.h
+++ b/lib/kernels/include/kernels/embedding_kernels.h
@@ -5,9 +5,7 @@
 #include "kernels/accessor.h"
 #include "op-attrs/ops/embedding.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Embedding {
+namespace FlexFlow::Kernels::Embedding {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR const &input,
                     GenericTensorAccessorW const &output,
@@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p);
 template <typename TD>
 __global__ void rand_generate_int(TD *ptr, size_t size, TD p);

-} // namespace Embedding
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Embedding

 #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H

diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h
index 3e600c48de..41b411c937 100644
--- a/lib/kernels/include/kernels/flat_kernels.h
+++ b/lib/kernels/include/kernels/flat_kernels.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Flat {
+namespace FlexFlow::Kernels::Flat {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR input,
@@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream,
                      float *input_grad_ptr,
                      float const *output_grad_ptr);

-} // namespace Flat
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Flat

 #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H

diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h
index 13bf4b898a..af2da3b11f 100644
--- a/lib/kernels/include/kernels/gather_kernels.h
+++ b/lib/kernels/include/kernels/gather_kernels.h
@@ -15,8 +15,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState,
                                              handle,
                                              legion_dim);

-namespace Kernels {
-namespace Gather {
+namespace Kernels::Gather {

 void forward_kernel(ffStream_t stream,
                     GatherPerDeviceState const &m,
@@ -30,8 +29,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &index,
                      GenericTensorAccessorW const &input_grad);

-} // namespace Gather
-} // namespace Kernels
+} // namespace Kernels::Gather
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h
index be13d32879..a6ae87442a 100644
--- a/lib/kernels/include/kernels/layer_norm_kernels.h
+++ b/lib/kernels/include/kernels/layer_norm_kernels.h
@@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState,
                                              bias,
                                              data_type);

-namespace Kernels {
-namespace LayerNorm {
+namespace Kernels::LayerNorm {

 // todo: this may have some problem.
 LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
@@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorW const &gamma_grad,
                      GenericTensorAccessorW const &beta_grad);

-} // namespace LayerNorm
-} // namespace Kernels
+} // namespace Kernels::LayerNorm
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H

diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h
index 3128e39fd0..99549adece 100644
--- a/lib/kernels/include/kernels/linear_kernels.h
+++ b/lib/kernels/include/kernels/linear_kernels.h
@@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState,
                                              weight_type,
                                              output_type);

-namespace Kernels {
-namespace Linear {
+namespace Kernels::Linear {

 LinearPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                  float *one_ptr,
@@ -72,8 +71,7 @@ void backward_kernel(ffStream_t stream,
                      int out_dim,
                      int batch_size);

-} // namespace Linear
-} // namespace Kernels
+} // namespace Kernels::Linear
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h
index 121ed184e9..cf6cfe35d1 100644
--- a/lib/kernels/include/kernels/local_cpu_allocator.h
+++ b/lib/kernels/include/kernels/local_cpu_allocator.h
@@ -7,14 +7,15 @@ struct LocalCPUAllocator : public IAllocator {
   LocalCPUAllocator() = default;
   LocalCPUAllocator(LocalCPUAllocator const &) = delete;
   LocalCPUAllocator(LocalCPUAllocator &&) = delete;
-  ~LocalCPUAllocator() override;
+  ~LocalCPUAllocator() = default;

   void *allocate(size_t) override;
-  void *allocate_and_zero(size_t) override;
   void deallocate(void *) override;

+  DeviceType get_allocation_device_type() const override;
+
 private:
-  std::unordered_set<void *> ptrs;
+  std::unordered_map<void *, std::unique_ptr<void, decltype(&free)>> ptrs;
 };
 CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator);

diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h
index 16f60daead..b8e0540974 100644
--- a/lib/kernels/include/kernels/local_cuda_allocator.h
+++ b/lib/kernels/include/kernels/local_cuda_allocator.h
@@ -10,9 +10,10 @@ struct LocalCudaAllocator : public IAllocator {
   ~LocalCudaAllocator() override;

   void *allocate(size_t) override;
-  void *allocate_and_zero(size_t) override;
   void deallocate(void *) override;

+  DeviceType get_allocation_device_type() const override;
+
 private:
   std::unordered_set<void *> ptrs;
 };

diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h
index b8a6784676..042911d172 100644
--- a/lib/kernels/include/kernels/nccl.h
+++ b/lib/kernels/include/kernels/nccl.h
@@ -23,15 +23,11 @@ struct ncclUniqueId {};
 struct ncclComm_t {};
 #endif

-namespace FlexFlow {
-namespace Kernels {
-namespace NCCL {
+namespace FlexFlow::Kernels::NCCL {

 ncclUniqueId generate_unique_id();

 ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank);

-} // namespace NCCL
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::NCCL

 #endif

diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h
index 64ef1a1352..e580c4a9de 100644
--- a/lib/kernels/include/kernels/partition_kernels.h
+++ b/lib/kernels/include/kernels/partition_kernels.h
@@ -13,8 +13,7 @@ struct RepartitionPerDeviceState {

 FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type);

-namespace Kernels {
-namespace Repartition {
+namespace Kernels::Repartition {

 RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
                                       DataType data_type);
@@ -29,8 +28,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorW const &output_grad,
                      GenericTensorAccessorR const &input_grad);

-} // namespace Repartition
-} // namespace Kernels
+} // namespace Kernels::Repartition
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H

diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h
index 798c0507f8..191c23bc98 100644
--- a/lib/kernels/include/kernels/pool_2d_kernels.h
+++ b/lib/kernels/include/kernels/pool_2d_kernels.h
@@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState,
                                              poolDesc,
                                              relu);

-namespace Kernels {
-namespace Pool2D {
+namespace Kernels::Pool2D {

 Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                  std::optional<Activation> activation,
@@ -75,8 +74,7 @@ void backward_kernel(ffStream_t stream,
                      void const *output_ptr,
                      void const *output_grad_ptr);

-} // namespace Pool2D
-} // namespace Kernels
+} // namespace Kernels::Pool2D
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H

diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h
index 4287472875..cd3930ea1c 100644
--- a/lib/kernels/include/kernels/reduce_kernels.h
+++ b/lib/kernels/include/kernels/reduce_kernels.h
@@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState,
                     op_type,
                     reduction_size);

-namespace Kernels {
-namespace Reduce {
+namespace Kernels::Reduce {

 ReducePerDeviceState init_kernel(PerDeviceFFHandle const &,
                                  OperatorType const &,
@@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream,
                      ReducePerDeviceState const &m,
                      float const *output_grad_ptr,
                      float *input_grad_ptr);
-} // namespace Reduce
-} // namespace Kernels
+} // namespace Kernels::Reduce
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H

diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h
index fb3baf215c..7e1e240ea4 100644
--- a/lib/kernels/include/kernels/reduction_kernels.h
+++ b/lib/kernels/include/kernels/reduction_kernels.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Reduction {
+namespace FlexFlow::Kernels::Reduction {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR const &input,
@@ -17,8 +15,6 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorW const &input,
                      GenericTensorAccessorR const &output);

-} // namespace Reduction
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Reduction

 #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H

diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h
index 409fc81f44..877eeabf04 100644
--- a/lib/kernels/include/kernels/replicate_kernels.h
+++ b/lib/kernels/include/kernels/replicate_kernels.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Replicate {
+namespace FlexFlow::Kernels::Replicate {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR const &input,
@@ -17,8 +15,6 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &output,
                      size_t num_replicas);

-} // namespace Replicate
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Replicate

 #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H

diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h
index 11d2f1bf5c..a72b799875 100644
--- a/lib/kernels/include/kernels/replicate_kernels_cpu.h
+++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Replicate {
+namespace FlexFlow::Kernels::Replicate {

 void cpu_forward_kernel(GenericTensorAccessorR const &input,
                         GenericTensorAccessorW const &output);

@@ -15,8 +13,6 @@ void cpu_backward_kernel(GenericTensorAccessorW const &input,
                          GenericTensorAccessorR const &output,
                          size_t num_replicas);

-} // namespace Replicate
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Replicate

 #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H

diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h
index a83caa6bea..5fa4382c43 100644
--- a/lib/kernels/include/kernels/reshape_kernels.h
+++ b/lib/kernels/include/kernels/reshape_kernels.h
@@ -13,8 +13,7 @@ struct ReshapePerDeviceState {

 FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type);

-namespace Kernels {
-namespace Reshape {
+namespace Kernels::Reshape {

 ReshapePerDeviceState init_kernel(DataType data_type);

@@ -28,8 +27,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorW const &input,
                      GenericTensorAccessorR const &output);

-} // namespace Reshape
-} // namespace Kernels
+} // namespace Kernels::Reshape
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H

diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h
index 42a83ae219..deb5b22155 100644
--- a/lib/kernels/include/kernels/reverse_kernels.h
+++ b/lib/kernels/include/kernels/reverse_kernels.h
@@ -3,9 +3,7 @@

 #include "device.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Reverse {
+namespace FlexFlow::Kernels::Reverse {

 void forward_kernel(ffStream_t stream,
                     float const *in_ptr,
@@ -23,8 +21,6 @@ void backward_kernel(ffStream_t stream,
                      coord_t in_blk_size,
                      coord_t input_size);

-} // namespace Reverse
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Reverse

 #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H

diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h
index bb17aa9400..b0edaa264c 100644
--- a/lib/kernels/include/kernels/reverse_kernels_cpu.h
+++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h
@@ -1,27 +1,22 @@
 #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H
 #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H

+#include "accessor.h"
 #include "device.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Reverse {
+namespace FlexFlow::Kernels::Reverse {

-void cpu_forward_kernel(float const *in_ptr,
-                        float *out_ptr,
+void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor,
+                        GenericTensorAccessorW &output_accessor,
                         coord_t num_out_blks,
                         coord_t reverse_dim_size,
-                        coord_t in_blk_size,
-                        coord_t output_size);
+                        coord_t in_blk_size);

-void cpu_backward_kernel(float const *out_grad_ptr,
-                         float *in_grad_ptr,
+void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor,
+                         GenericTensorAccessorW &input_accessor,
                          coord_t num_out_blks,
                          coord_t reverse_dim_size,
-                         coord_t in_blk_size,
-                         coord_t input_size);
-} // namespace Reverse
-} // namespace Kernels
-} // namespace FlexFlow
+                         coord_t in_blk_size);
+} // namespace FlexFlow::Kernels::Reverse

 #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H

diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h
index 061230ec52..93135cb648 100644
--- a/lib/kernels/include/kernels/softmax_kernels.h
+++ b/lib/kernels/include/kernels/softmax_kernels.h
@@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState {

 FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim);

-namespace Kernels {
-namespace Softmax {
+namespace Kernels::Softmax {

 SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
                                   int dim,
@@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream,
                      float const *output_grad_ptr,
                      size_t num_elements);

-} // namespace Softmax
-} // namespace Kernels
+} // namespace Kernels::Softmax
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h
index 36434d4be8..538b9602c2 100644
--- a/lib/kernels/include/kernels/split_kernels.h
+++ b/lib/kernels/include/kernels/split_kernels.h
@@ -3,10 +3,7 @@

 #include "device.h"

-namespace FlexFlow {
-
-namespace Kernels {
-namespace Split {
+namespace FlexFlow::Kernels::Split {
 void forward_kernel(ffStream_t stream,
                     float **out_ptrs,
                     float const *in_ptr,
@@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream,
                      coord_t num_blks,
                      int numOutputs);

-} // namespace Split
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Split

 #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H

diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h
index ae1c739f6c..6f33381e1a 100644
--- a/lib/kernels/include/kernels/topk_kernels.h
+++ b/lib/kernels/include/kernels/topk_kernels.h
@@ -12,8 +12,7 @@ struct TopKPerDeviceState {

 FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted);

-namespace Kernels {
-namespace TopK {
+namespace Kernels::TopK {

 TopKPerDeviceState init_kernel(bool sorted);

@@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream,
                      int length,
                      int k);

-} // namespace TopK
-} // namespace Kernels
+} // namespace Kernels::TopK
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H

diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h
index 56da81ba2b..b48b7e0aa8 100644
--- a/lib/kernels/include/kernels/transpose_kernels.h
+++ b/lib/kernels/include/kernels/transpose_kernels.h
@@ -16,8 +16,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TransposePerDeviceState,
                                              num_dim,
                                              perm);

-namespace Kernels {
-namespace Transpose {
+namespace Kernels::Transpose {

 TransposePerDeviceState init_kernel(int num_dim,
                                     std::vector<ff_dim_t> const &perm);
@@ -32,8 +31,7 @@ void backward_kernel(cudaStream_t stream,
                      GenericTensorAccessorW const &in_grad,
                      GenericTensorAccessorR const &out_grad);

-} // namespace Transpose
-} // namespace Kernels
+} // namespace Kernels::Transpose
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H
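Note: the accessor.cc changes that follow implement the bounds-checked, row-major offset computation behind at(). A standalone sketch of the same scheme, assuming plain std::vector shapes rather than ArrayShape/legion_dim_t:

    #include <cstddef>
    #include <stdexcept>
    #include <vector>

    // offset = sum_i idx[i] * stride[i]; the innermost dimension has stride 1
    // and each outer stride is the product of all inner extents.
    size_t row_major_offset(std::vector<size_t> const &idx,
                            std::vector<size_t> const &dims) {
      if (idx.size() != dims.size()) {
        throw std::runtime_error("rank mismatch");
      }
      size_t offset = 0;
      size_t multiplier = 1;
      for (size_t i = dims.size(); i-- > 0;) {
        if (idx[i] >= dims[i]) {
          throw std::runtime_error("index out of bounds");
        }
        offset += idx[i] * multiplier;
        multiplier *= dims[i];
      }
      return offset;
    }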
diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc
index 66d3c02300..c0b11a2299 100644
--- a/lib/kernels/src/accessor.cc
+++ b/lib/kernels/src/accessor.cc
@@ -2,6 +2,64 @@
 namespace FlexFlow {

+GenericTensorAccessorW::GenericTensorAccessorW(
+    DataType data_type,
+    ArrayShape const &shape,
+    void *ptr,
+    DeviceType device_type = DeviceType::GPU)
+    : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {}
+
+std::tuple<DataType, ArrayShape, void *, DeviceType>
+    GenericTensorAccessorW::tie() const {
+  return std::tie(this->data_type, this->shape, this->ptr, this->device_type);
+}
+
+size_t GenericTensorAccessorW::calculate_index_offset(
+    std::initializer_list<size_t> const &indices) const {
+
+  if (indices.size() != this->shape.num_dims()) {
+    throw mk_runtime_error(
+        "Number of indices ({}) does not match the number of dimensions ({}).",
+        indices.size(),
+        this->shape.num_dims());
+  }
+
+  size_t offset = 0;
+  size_t multiplier = 1;
+  size_t cur_idx;
+  auto it = indices.end() - 1;
+
+  for (std::size_t i = this->shape.num_dims(); i-- > 0;) {
+    cur_idx = *it--;
+
+    if (cur_idx >= this->shape[legion_dim_t(i)]) {
+      throw mk_runtime_error("In {} dimension, attempting to access index {} "
+                             "when only {} indexes exist",
+                             i,
+                             cur_idx,
+                             this->shape[legion_dim_t(i)]);
+    }
+
+    offset += cur_idx * multiplier;
+    multiplier *= this->shape[legion_dim_t(i)];
+  }
+
+  return offset;
+}
+
+bool GenericTensorAccessorW::operator==(
+    GenericTensorAccessorW const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool GenericTensorAccessorW::operator!=(
+    GenericTensorAccessorW const &other) const {
+  return this->tie() != other.tie();
+}
+
 int32_t *GenericTensorAccessorW::get_int32_ptr() const {
   return this->get<DataType::INT32>();
 }
@@ -33,6 +91,64 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) {
   return (s << fmt::to_string(a));
 }

+GenericTensorAccessorR::GenericTensorAccessorR(
+    DataType data_type,
+    ArrayShape const &shape,
+    void const *ptr,
+    DeviceType device_type = DeviceType::GPU)
+    : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {}
+
+std::tuple<DataType, ArrayShape, void const *, DeviceType>
+    GenericTensorAccessorR::tie() const {
+  return std::tie(this->data_type, this->shape, this->ptr, this->device_type);
+}
+
+size_t GenericTensorAccessorR::calculate_index_offset(
+    std::initializer_list<size_t> const &indices) const {
+
+  if (indices.size() != this->shape.num_dims()) {
+    throw mk_runtime_error(
+        "Number of indices ({}) does not match the number of dimensions ({}).",
+        indices.size(),
+        this->shape.num_dims());
+  }
+
+  size_t offset = 0;
+  size_t multiplier = 1;
+  size_t cur_idx;
+  auto it = indices.end() - 1;
+
+  for (std::size_t i = this->shape.num_dims(); i-- > 0;) {
+    cur_idx = *it--;
+
+    if (cur_idx >= this->shape[legion_dim_t(i)]) {
+      throw mk_runtime_error("In {} dimension, attempting to access index {} "
+                             "when only {} indexes exist",
+                             i,
+                             cur_idx,
+                             this->shape[legion_dim_t(i)]);
+    }
+
+    offset += cur_idx * multiplier;
+    multiplier *= this->shape[legion_dim_t(i)];
+  }
+
+  return offset;
+}
+
+bool GenericTensorAccessorR::operator==(
+    GenericTensorAccessorR const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool GenericTensorAccessorR::operator!=(
+    GenericTensorAccessorR const &other) const {
+  return this->tie() != other.tie();
+}
+
 int32_t const *GenericTensorAccessorR::get_int32_ptr() const {
   return this->get<DataType::INT32>();
 }
@@ -159,7 +275,7 @@ GenericTensorAccessorR read_only_accessor_from_write_accessor(
   return GenericTensorAccessorR{writable.data_type,
                                 writable.shape,
                                 req(writable.ptr),
-                                writable.on_device};
+                                writable.device_type};
 }

 bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1,

diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
index ce06fbabe0..751cdc0ebb 100644
--- a/lib/kernels/src/allocation.cc
+++ b/lib/kernels/src/allocation.cc
@@ -7,26 +7,19 @@ void *Allocator::allocate(size_t mem_size) {
   return this->i_allocator->allocate(mem_size);
 }

-void *Allocator::allocate_and_zero(size_t mem_size) {
-  return this->i_allocator->allocate_and_zero(mem_size);
-}
-
 void Allocator::deallocate(void *ptr) {
   this->i_allocator->deallocate(ptr);
 }

-GenericTensorAccessorW
-    Allocator::allocate_tensor(TensorShape const &tensor_shape) {
-  void *ptr = this->allocate(get_size_in_bytes(tensor_shape));
-  bool on_device = this->alloc_location == AllocLocation::DEVICE;
-  return {tensor_shape.data_type, tensor_shape, ptr, on_device};
+DeviceType Allocator::get_allocation_device_type() const {
+  return this->i_allocator->get_allocation_device_type();
 }

 GenericTensorAccessorW
-    Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) {
-  void *ptr = this->allocate_and_zero(get_size_in_bytes(tensor_shape));
-  bool on_device = this->alloc_location == AllocLocation::DEVICE;
-  return {tensor_shape.data_type, tensor_shape, ptr, on_device};
+    Allocator::allocate_tensor(TensorShape const &tensor_shape) {
+  void *ptr = this->allocate(get_size_in_bytes(tensor_shape));
+  return {
+      tensor_shape.data_type, tensor_shape, ptr, get_allocation_device_type()};
 }

 } // namespace FlexFlow

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index d5e2f1167d..5c18a9ab5a 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -53,6 +53,7 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
 ArrayShape ArrayShape::sub_shape(
     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+
   NOT_IMPLEMENTED();
 }

diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc
index 5888d9a96a..2d3f440c75 100644
--- a/lib/kernels/src/cpu/cast_kernels.cc
+++ b/lib/kernels/src/cpu/cast_kernels.cc
@@ -1,9 +1,7 @@
 #include "kernels/cast_kernels_cpu.h"
 #include "kernels/datatype_dispatch.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Cast {
+namespace FlexFlow::Kernels::Cast {

 template <typename IDT, typename ODT>
 void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) {
@@ -53,6 +51,4 @@ void cpu_backward_kernel(GenericTensorAccessorR const &input,
       input_type, output_type, input, output);
 }

-} // namespace Cast
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Cast

diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc
index e48f4c3e01..d0be1f9f2d 100644
--- a/lib/kernels/src/cpu/combine_kernels.cc
+++ b/lib/kernels/src/cpu/combine_kernels.cc
@@ -1,9 +1,7 @@
 #include "kernels/combine_kernels_cpu.h"
 #include "kernels/datatype_dispatch.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Combine {
+namespace FlexFlow::Kernels::Combine {

 template <DataType DT>
 struct CPUForwardKernel {
@@ -37,6 +35,4 @@ void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
       input_grad.data_type, output_grad, input_grad);
 }

-} // namespace Combine
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Combine
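Note: both CPU kernels above route a runtime DataType tag to a compile-time template parameter through the DataTypeDispatch helpers. A reduced sketch of the dispatch idea, with only two enumerators shown (the real helper in kernels/datatype_dispatch.h covers every DataType value):

    #include <utility>

    enum class DataType { FLOAT, INT32 };

    // F is a functor template indexed by a compile-time DataType; the switch
    // selects the instantiation matching the runtime tag.
    template <template <DataType> class F, typename... Args>
    void dispatch(DataType dt, Args &&...args) {
      switch (dt) {
        case DataType::FLOAT:
          return F<DataType::FLOAT>{}(std::forward<Args>(args)...);
        case DataType::INT32:
          return F<DataType::INT32>{}(std::forward<Args>(args)...);
      }
    }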
diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc
index 239baf4041..5853869047 100644
--- a/lib/kernels/src/cpu/replicate_kernels.cc
+++ b/lib/kernels/src/cpu/replicate_kernels.cc
@@ -1,26 +1,22 @@
 #include "kernels/datatype_dispatch.h"
 #include "kernels/replicate_kernels_cpu.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Replicate {
+namespace FlexFlow::Kernels::Replicate {

 template <typename T>
 void cpu_replicate_backward_kernel(T *input,
                                    T const *output,
                                    size_t num_elements,
                                    size_t num_replicas) {
-  for (size_t i = 0; i < num_elements; ++i) {
+  for (size_t i = 0; i < num_elements; i++) {
     T sum = 0;
-    for (size_t j = 0; j < num_replicas; ++j) {
+    for (size_t j = 0; j < num_replicas; j++) {
       sum += output[i + j * num_elements];
     }
     input[i] = sum;
   }
 }

-// Why does replicate forward seem to only transfer memory? Shouldn't it also
-// handle the replication?
 template <DataType DT>
 struct CPUForwardKernel {
   void operator()(GenericTensorAccessorR const &input,
@@ -36,9 +32,10 @@ struct CPUBackwardKernel {
   void operator()(GenericTensorAccessorW const &input,
                   GenericTensorAccessorR const &output,
                   size_t num_replicas) {
-    size_t total_elements = input.shape.num_elements() * num_replicas;
-    cpu_replicate_backward_kernel(
-        input.get<DT>(), output.get<DT>(), total_elements, num_replicas);
+    cpu_replicate_backward_kernel(input.get<DT>(),
+                                  output.get<DT>(),
+                                  input.shape.num_elements(),
+                                  num_replicas);
   }
 };

@@ -54,6 +51,4 @@ void cpu_backward_kernel(GenericTensorAccessorW const &input,
       input.data_type, input, output, num_replicas);
 }

-} // namespace Replicate
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Replicate

diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc
index 350dad03e9..1971435d8c 100644
--- a/lib/kernels/src/cpu/reverse_kernels.cc
+++ b/lib/kernels/src/cpu/reverse_kernels.cc
@@ -2,77 +2,66 @@
 #include <algorithm>
 #include <vector>

-namespace FlexFlow {
-namespace Kernels {
-namespace Reverse {
+namespace FlexFlow::Kernels::Reverse {

-void cpu_reverse_forward_kernel(float const *in_ptr,
-                                float *out_ptr,
-                                coord_t num_out_blks,
-                                coord_t reverse_dim_size,
-                                coord_t in_blk_size) {
-  coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size;
+template <DataType DT>
+struct CPUReverseForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output,
+                  coord_t num_out_blks,
+                  coord_t reverse_dim_size,
+                  coord_t in_blk_size) {
+    assert(input.data_type == DT && output.data_type == DT);

-  std::vector<std::vector<float>> in_blocks(num_out_blks * reverse_dim_size,
-                                            std::vector<float>(in_blk_size));
-
-  // For each output block, copy the input block into in_blocks
-  for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
-    // Each output block has reverse_dim_size input blocks
-    for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) {
-      coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size;
-
-      // Copy elements from in_ptr to the current block in in_blocks
-      std::vector<float> &current_block =
-          in_blocks[blk_idx * reverse_dim_size + rev_idx];
-      for (coord_t i = 0; i < in_blk_size; ++i) {
-        current_block[i] = in_ptr[start_idx + i];
+    // For each output block, copy the input block
+    for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
+      for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) {
+        for (coord_t i = 0; i < in_blk_size; ++i) {
+          output.at<DT>(blk_idx, rev_idx, i) =
+              input.at<DT>(blk_idx, rev_idx, i);
+        }
       }
     }
-  }

-  // Reverse the in_blocks within each output block
-  for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
-    auto block_start = in_blocks.begin() + blk_idx * reverse_dim_size;
-    auto block_end = block_start + reverse_dim_size;
-    std::reverse(block_start, block_end);
-  }
-
-  // Copy the reversed blocks to the output array
-  for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
-    for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) {
-      coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size;
+    // Reverse the blocks within each output block
+    for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
+      for (coord_t rev_idx = 0; rev_idx < reverse_dim_size / 2; ++rev_idx) {
+        coord_t start_idx = rev_idx;
+        coord_t end_idx = reverse_dim_size - 1 - rev_idx;

-      // Copy elements from the current block in in_blocks to out_ptr
-      std::vector<float> const &current_block =
-          in_blocks[blk_idx * reverse_dim_size + rev_idx];
-      for (coord_t i = 0; i < in_blk_size; ++i) {
-        out_ptr[start_idx + i] = current_block[i];
+        for (coord_t i = 0; i < in_blk_size; ++i) {
+          std::swap(output.at<DT>(blk_idx, start_idx, i),
+                    output.at<DT>(blk_idx, end_idx, i));
+        }
       }
     }
   }
-}
+};

-void cpu_forward_kernel(float const *in_ptr,
-                        float *out_ptr,
+void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor,
+                        GenericTensorAccessorW &output_accessor,
                         coord_t num_out_blks,
                         coord_t reverse_dim_size,
-                        coord_t in_blk_size,
-                        coord_t output_size) {
-  cpu_reverse_forward_kernel(
-      in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size);
+                        coord_t in_blk_size) {
+  DataTypeDispatch1<CPUReverseForwardKernel>{}(input_accessor.data_type,
+                                               input_accessor,
+                                               std::ref(output_accessor),
+                                               num_out_blks,
+                                               reverse_dim_size,
+                                               in_blk_size);
 }

-void cpu_backward_kernel(float const *out_grad_ptr,
-                         float *in_grad_ptr,
+void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor,
+                         GenericTensorAccessorW &input_accessor,
                          coord_t num_out_blks,
                          coord_t reverse_dim_size,
-                         coord_t in_blk_size,
-                         coord_t input_size) {
-  cpu_reverse_forward_kernel(
-      out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size);
+                         coord_t in_blk_size) {
+  DataTypeDispatch1<CPUReverseForwardKernel>{}(output_accessor.data_type,
+                                               output_accessor,
+                                               std::ref(input_accessor),
+                                               num_out_blks,
+                                               reverse_dim_size,
+                                               in_blk_size);
 }

-} // namespace Reverse
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Reverse

diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
index 68004738d2..ad216feda2 100644
--- a/lib/kernels/src/cuda/ops/concat_kernels.cu
+++ b/lib/kernels/src/cuda/ops/concat_kernels.cu
@@ -17,9 +17,7 @@
 #include "kernels/concat_kernels.h"
 #include <cassert>

-namespace FlexFlow {
-namespace Kernels {
-namespace Concat {
+namespace FlexFlow::Kernels::Concat {

 void calc_blk_size(size_t &num_blocks,
                    size_t &blk_size,
@@ -87,6 +85,4 @@ void backward_kernel(cudaStream_t stream,
   }
 }

-} // namespace Concat
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Concat

diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc
index ced707edcc..5cf337c685 100644
--- a/lib/kernels/src/local_cpu_allocator.cc
+++ b/lib/kernels/src/local_cpu_allocator.cc
@@ -1,34 +1,16 @@
 #include "kernels/local_cpu_allocator.h"
 #include "kernels/device.h"
+#include "utils/containers/contains_key.h"

 namespace FlexFlow {
 void *LocalCPUAllocator::allocate(size_t requested_memory_size) {
   void *ptr = malloc(requested_memory_size);
-
-  if (ptr != nullptr) {
-    this->ptrs.insert(ptr);
-  } else {
-    throw std::bad_alloc();
-  }
-
-  return ptr;
-}
-
-void *LocalCPUAllocator::allocate_and_zero(size_t requested_memory_size) {
-  void *ptr = calloc(1, requested_memory_size);
-
-  if (ptr != nullptr) {
-    this->ptrs.insert(ptr);
-  } else {
-    throw std::bad_alloc();
-  }
-
+  this->ptrs.insert({ptr, std::unique_ptr<void, decltype(&free)>(ptr, free)});
   return ptr;
 }

 void LocalCPUAllocator::deallocate(void *ptr) {
-  if (contains(this->ptrs, ptr)) {
-    free(ptr);
+  if (contains_key(this->ptrs, ptr)) {
     this->ptrs.erase(ptr);
   } else {
     throw std::runtime_error(
@@ -36,15 +18,12 @@ void LocalCPUAllocator::deallocate(void *ptr) {
   }
 }

-LocalCPUAllocator::~LocalCPUAllocator() {
-  for (void *ptr : this->ptrs) {
-    free(ptr);
-  }
+DeviceType LocalCPUAllocator::get_allocation_device_type() const {
+  return DeviceType::CPU;
 }

 Allocator create_local_cpu_memory_allocator() {
   Allocator allocator = Allocator::create<LocalCPUAllocator>();
-  allocator.alloc_location = AllocLocation::HOST;
   return allocator;
 }
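Note: the allocator rewrite above keys each allocation by its raw pointer and hands ownership to a std::unique_ptr whose deleter is free, so erase() and the now-defaulted destructor each release memory exactly once. A reduced sketch of the ownership scheme (CpuPool is an illustrative stand-in for LocalCPUAllocator):

    #include <cstdlib>
    #include <memory>
    #include <unordered_map>

    struct CpuPool {
      void *allocate(size_t n) {
        void *p = std::malloc(n);
        // The map entry owns p; its deleter runs on erase() or ~CpuPool().
        ptrs.insert(
            {p, std::unique_ptr<void, decltype(&std::free)>(p, std::free)});
        return p;
      }
      void deallocate(void *p) {
        ptrs.erase(p); // the unique_ptr deleter frees p here
      }

    private:
      std::unordered_map<void *, std::unique_ptr<void, decltype(&std::free)>>
          ptrs;
    };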
diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc
index b6c615a5ca..416768a479 100644
--- a/lib/kernels/src/local_cuda_allocator.cc
+++ b/lib/kernels/src/local_cuda_allocator.cc
@@ -10,14 +10,6 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) {
   return ptr;
 }

-void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) {
-  void *ptr;
-  checkCUDA(cudaMalloc(&ptr, requested_memory_size));
-  checkCUDA(cudaMemset(ptr, 0, requested_memory_size));
-  this->ptrs.insert(ptr);
-  return ptr;
-}
-
 void LocalCudaAllocator::deallocate(void *ptr) {
   if (contains(this->ptrs, ptr)) {
     checkCUDA(cudaFree(ptr));
@@ -28,6 +20,10 @@ void LocalCudaAllocator::deallocate(void *ptr) {
   }
 }

+DeviceType LocalCudaAllocator::get_allocation_device_type() const {
+  return DeviceType::GPU;
+}
+
 LocalCudaAllocator::~LocalCudaAllocator() {
   for (void *ptr : this->ptrs) {
     checkCUDA(cudaFree(ptr));
@@ -36,7 +32,6 @@ LocalCudaAllocator::~LocalCudaAllocator() {

 Allocator create_local_cuda_memory_allocator() {
   Allocator allocator = Allocator::create<LocalCudaAllocator>();
-  allocator.alloc_location = AllocLocation::DEVICE;
   return allocator;
 }

diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt
index 00da2d0d70..066cb96753 100644
--- a/lib/kernels/test/CMakeLists.txt
+++ b/lib/kernels/test/CMakeLists.txt
@@ -14,6 +14,7 @@ ff_add_test_executable(
     cudnn
     cudart
     cublas
+    pcg
 )

 set(FF_TEST_EXEC_NAME "kernels-tests")

diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc
index a6990d2ad0..e9674cd167 100644
--- a/lib/kernels/test/src/test_cast_kernel.cc
+++ b/lib/kernels/test/src/test_cast_kernel.cc
@@ -94,44 +94,34 @@ TEST_SUITE(FF_TEST_SUITE) {

     // Only calling forward kernel as backward kernel is exactly the same
     SUBCASE("forward_kernel") {
-      auto transform = [start_val = 1.1f,
-                        counter = 0.0f](float input) mutable -> float {
-        return start_val + counter++;
-      };
-
       // Run GPU Forward Kernel
       GenericTensorAccessorW input_accessor_gpu =
-          create_transformed_accessor_w<float>(
-              input_shape, gpu_allocator, transform);
+          create_random_filled_accessor_w<DataType::FLOAT>(input_shape,
+                                                           gpu_allocator);
       Kernels::Cast::forward_kernel(
           managed_stream.raw_stream(),
           read_only_accessor_from_write_accessor(input_accessor_gpu),
           output_accessor_gpu,
           DataType::FLOAT,
           DataType::INT32);

-      std::cout << "Before GPU load" << std::endl;
+      std::vector<int32_t> result_data_gpu =
+          load_accessor_data<DataType::INT32>(output_accessor_gpu);

       // Run CPU Forward Kernel
       GenericTensorAccessorW input_accessor_cpu =
-          create_transformed_accessor_w<float>(
-              input_shape, cpu_allocator, transform);
+          create_random_filled_accessor_w<DataType::FLOAT>(input_shape,
+                                                           cpu_allocator);
       Kernels::Cast::cpu_forward_kernel(
           read_only_accessor_from_write_accessor(input_accessor_cpu),
           output_accessor_cpu,
           DataType::FLOAT,
           DataType::INT32);

-      std::cout << "Before CPU load" << std::endl;
-      if (output_accessor_cpu.on_device) {
-        std::cout << "CPU data is on device" << std::endl;
-      } else {
-        std::cout << "CPU data is on host" << std::endl;
-      }
+      std::vector<int32_t> result_data_cpu =
+          load_accessor_data<DataType::INT32>(output_accessor_cpu);

-      CHECK(result_data_gpu == result_data_cpu);
+      CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
     }
   }
 }

diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc
index e952f1107f..8af741b3a7 100644
--- a/lib/kernels/test/src/test_replicate_kernel.cc
+++ b/lib/kernels/test/src/test_replicate_kernel.cc
@@ -112,17 +112,12 @@ TEST_SUITE(FF_TEST_SUITE) {
   }

   TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") {
-    std::size_t num_replicas = 10;
+    std::size_t num_replicas = 2;

-    // This should be like three shapes: pre_replication, replication shape, and
-    // reduced shape, but things are weird cause doesn't seem to be replicating
-    // anything (ie. input shape should be same as reduced shape)
     TensorShape input_shape =
-        make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT);
-    TensorShape replicated_shape =
-        make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT);
-    TensorShape reduced_shape =
-        make_tensor_shape_from_legion_dims({10}, DataType::FLOAT);
+        make_tensor_shape_from_legion_dims({5}, DataType::FLOAT);
+    TensorShape output_shape =
+        make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT);

     ManagedPerDeviceFFHandle managed_handle{};
     ManagedFFStream managed_stream{};
@@ -136,7 +131,8 @@ TEST_SUITE(FF_TEST_SUITE) {
         create_random_filled_accessor_r<DataType::FLOAT>(input_shape,
                                                          gpu_allocator);
     GenericTensorAccessorW output_accessor_gpu =
-        gpu_allocator.allocate_tensor(replicated_shape);
+        gpu_allocator.allocate_tensor(output_shape);
+    fill_with_zeros(output_accessor_gpu);

     Kernels::Replicate::forward_kernel(
         managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu);
@@ -145,29 +141,29 @@ TEST_SUITE(FF_TEST_SUITE) {
         load_accessor_data<DataType::FLOAT>(output_accessor_gpu);

     // Run CPU Replicate Forward Kernel
-    GenericTensorAccessorW input_accessor_cpu =
-        copy_tensor_between_memories<DataType::FLOAT>(input_accessor_gpu,
-                                                      cpu_allocator);
+    GenericTensorAccessorR input_accessor_cpu =
+        copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
     GenericTensorAccessorW output_accessor_cpu =
-        cpu_allocator.allocate_tensor(replicated_shape);
+        cpu_allocator.allocate_tensor(output_shape);
+    fill_with_zeros(output_accessor_cpu);

-    Kernels::Replicate::cpu_forward_kernel(
-        read_only_accessor_from_write_accessor(input_accessor_cpu),
-        output_accessor_cpu);
+    Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu,
+                                           output_accessor_cpu);

     std::vector<float> result_data_cpu =
         load_accessor_data<DataType::FLOAT>(output_accessor_cpu);

-    CHECK(result_data_gpu == result_data_cpu);
+    CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
   }

   SUBCASE("backward_kernel") {
     // Run GPU Replicate Backward Kernel
     GenericTensorAccessorR output_grad_accessor_gpu =
-        create_random_filled_accessor_r<DataType::FLOAT>(replicated_shape,
+        create_random_filled_accessor_r<DataType::FLOAT>(output_shape,
                                                          gpu_allocator);
     GenericTensorAccessorW input_grad_accessor_gpu =
-        gpu_allocator.allocate_tensor_and_zero(reduced_shape);
+        gpu_allocator.allocate_tensor(input_shape);
+    fill_with_zeros(input_grad_accessor_gpu);

     Kernels::Replicate::backward_kernel(managed_stream.raw_stream(),
                                         input_grad_accessor_gpu,
@@ -178,21 +174,20 @@ TEST_SUITE(FF_TEST_SUITE) {
         load_accessor_data<DataType::FLOAT>(input_grad_accessor_gpu);

     // Run CPU Replicate Backward Kernel
-    GenericTensorAccessorW output_grad_accessor_cpu =
-        copy_tensor_between_memories<DataType::FLOAT>(
-            output_grad_accessor_gpu, cpu_allocator);
+    GenericTensorAccessorR output_grad_accessor_cpu =
+        copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
+
     GenericTensorAccessorW input_grad_accessor_cpu =
-        cpu_allocator.allocate_tensor_and_zero(reduced_shape);
+        cpu_allocator.allocate_tensor(input_shape);
+    fill_with_zeros(input_grad_accessor_cpu);

     Kernels::Replicate::cpu_backward_kernel(
         input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas);

     std::vector<float> result_data_cpu =
         load_accessor_data<DataType::FLOAT>(input_grad_accessor_cpu);

     CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
   }
 }
}

diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc
index 7899afa718..b1f90a0a7e 100644
--- a/lib/kernels/test/src/test_reverse_kernels.cc
+++ b/lib/kernels/test/src/test_reverse_kernels.cc
@@ -159,11 +159,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       };

       // Run GPU Cast Forward Kernel
-      GenericTensorAccessorW input_accessor_gpu =
-          create_transformed_accessor_w<float>(
-              input_shape, gpu_allocator, transform);
+      GenericTensorAccessorR input_accessor_gpu =
+          create_random_filled_accessor_r<DataType::FLOAT>(input_shape,
+                                                           gpu_allocator);
       GenericTensorAccessorW output_accessor_gpu =
           gpu_allocator.allocate_tensor(output_shape);
+      fill_with_zeros(output_accessor_gpu);

       Kernels::Reverse::forward_kernel(managed_stream.raw_stream(),
                                        input_accessor_gpu.get_float_ptr(),
@@ -177,33 +178,32 @@ TEST_SUITE(FF_TEST_SUITE) {
           load_accessor_data<DataType::FLOAT>(output_accessor_gpu);

       // Run CPU Cast Forward Kernel
-      GenericTensorAccessorW input_accessor_cpu =
-          create_transformed_accessor_w<float>(
-              input_shape, cpu_allocator, transform);
+      GenericTensorAccessorR input_accessor_cpu =
+          copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
       GenericTensorAccessorW output_accessor_cpu =
           cpu_allocator.allocate_tensor(output_shape);
+      fill_with_zeros(output_accessor_cpu);

-      Kernels::Reverse::cpu_forward_kernel(
-          input_accessor_cpu.get_float_ptr(),
-          output_accessor_cpu.get_float_ptr(),
-          num_out_blks,
-          reverse_dim_size,
-          in_blk_size,
-          input_accessor_cpu.shape.num_elements());
+      Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu,
+                                           output_accessor_cpu,
+                                           num_out_blks,
+                                           reverse_dim_size,
+                                           in_blk_size);

       std::vector<float> result_data_cpu =
           load_accessor_data<DataType::FLOAT>(output_accessor_cpu);

-      CHECK(result_data_gpu == result_data_cpu);
+      CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
     }

     SUBCASE("backward_kernel") {
       // Run GPU Cast Backward Kernel
-      GenericTensorAccessorW output_grad_accessor_gpu =
-          create_random_filled_accessor_w(output_shape,
+      GenericTensorAccessorR output_grad_accessor_gpu =
+          create_random_filled_accessor_r<DataType::FLOAT>(output_shape,
                                                            gpu_allocator);
       GenericTensorAccessorW input_grad_accessor_gpu =
           gpu_allocator.allocate_tensor(input_shape);
+      fill_with_zeros(input_grad_accessor_gpu);

       Kernels::Reverse::backward_kernel(
           managed_stream.raw_stream(),
@@ -218,25 +218,22 @@ TEST_SUITE(FF_TEST_SUITE) {
           load_accessor_data<DataType::FLOAT>(input_grad_accessor_gpu);

       // Run CPU Cast Backward Kernel
-      GenericTensorAccessorW output_grad_accessor_cpu =
-          copy_tensor_between_memories<DataType::FLOAT>(
-              read_only_accessor_from_write_accessor(output_grad_accessor_gpu),
-              cpu_allocator);
+      GenericTensorAccessorR output_grad_accessor_cpu =
+          copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
       GenericTensorAccessorW input_grad_accessor_cpu =
           cpu_allocator.allocate_tensor(input_shape);
+      fill_with_zeros(input_grad_accessor_cpu);

-      Kernels::Reverse::cpu_backward_kernel(
-          output_grad_accessor_cpu.get_float_ptr(),
-          input_grad_accessor_cpu.get_float_ptr(),
-          num_out_blks,
-          reverse_dim_size,
-          in_blk_size,
-          input_grad_accessor_cpu.shape.num_elements());
+      Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu,
+                                            input_grad_accessor_cpu,
+                                            num_out_blks,
+                                            reverse_dim_size,
+                                            in_blk_size);

       std::vector<float> result_data_cpu =
           load_accessor_data<DataType::FLOAT>(input_grad_accessor_cpu);

-      CHECK(result_data_gpu == result_data_cpu);
+      CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
     }
   }
 }
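Note: the test rewrites above all follow one pattern: run the CUDA kernel on device-resident tensors, copy the same inputs to host memory, run the CPU reference kernel, and compare the two outputs with a tolerance. Sketched below in terms of the test_utils API in this patch; run_gpu and run_cpu are hypothetical stand-ins for the actual kernel invocations:

    GenericTensorAccessorR in_gpu =
        create_random_filled_accessor_r<DataType::FLOAT>(shape, gpu_allocator);
    GenericTensorAccessorW out_gpu = gpu_allocator.allocate_tensor(out_shape);
    fill_with_zeros(out_gpu);
    run_gpu(in_gpu, out_gpu); // CUDA kernel under test

    GenericTensorAccessorR in_cpu = copy_tensor_accessor_r(in_gpu, cpu_allocator);
    GenericTensorAccessorW out_cpu = cpu_allocator.allocate_tensor(out_shape);
    fill_with_zeros(out_cpu);
    run_cpu(in_cpu, out_cpu); // CPU reference kernel

    CHECK(vectors_are_approx_equal(
        load_accessor_data<DataType::FLOAT>(out_gpu),
        load_accessor_data<DataType::FLOAT>(out_cpu)));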
diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h
index d4511c9dc5..a9d522b948 100644
--- a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_KERNELS_TEST_UTILS
 #define _FLEXFLOW_KERNELS_TEST_UTILS

+#include "kernels/datatype_dispatch.h"
 #include "kernels/device.h"
 #include "kernels/local_cpu_allocator.h"
 #include "kernels/local_cuda_allocator.h"
@@ -17,18 +18,16 @@ using namespace FlexFlow;
 template <typename DT>
 void transfer_memory(GenericTensorAccessorW dst_accessor,
                      const DT *src,
-                     AllocLocation src_loc) {
+                     DeviceType src_device_type) {
   size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT);

-  AllocLocation dst_loc =
-      dst_accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST;
+  DeviceType dst_device_type = dst_accessor.device_type;
+
-  if (src_loc == AllocLocation::HOST && dst_loc == AllocLocation::HOST) {
+  if (device_on_cpu(src_device_type) && device_on_cpu(dst_device_type)) {
     memcpy(dst_accessor.ptr, src, bytes);
-  } else if (src_loc == AllocLocation::HOST &&
-             dst_loc == AllocLocation::DEVICE) {
+  } else if (device_on_cpu(src_device_type) && device_on_gpu(dst_device_type)) {
     checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice));
-  } else if (src_loc == AllocLocation::DEVICE &&
-             dst_loc == AllocLocation::HOST) {
+  } else if (device_on_gpu(src_device_type) && device_on_cpu(dst_device_type)) {
     checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost));
   } else {
     checkCUDA(
@@ -41,11 +40,10 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
                                                        Allocator &allocator) {
   assert(shape.data_type == DataType::FLOAT ||
          shape.data_type == DataType::DOUBLE);
-  using T = real_type<DT>;
+
+  using T = real_type_t<DT>;

   GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  accessor.on_device =
-      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;

   std::vector<T> host_data(accessor.shape.num_elements());
   std::random_device rd;
@@ -56,7 +54,7 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
     val = dist(gen);
   }

-  transfer_memory(accessor, host_data.data(), AllocLocation::HOST);
+  transfer_memory(accessor, host_data.data(), DeviceType::CPU);

   return accessor;
 }
@@ -64,103 +62,64 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
 template <DataType DT>
 GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
                                                        Allocator &allocator) {
+  using T = real_type_t<DT>;
   GenericTensorAccessorW accessor =
       create_random_filled_accessor_w<DT>(shape, allocator);

   return read_only_accessor_from_write_accessor(accessor);
 }

-template <typename DT>
+template <typename T>
 GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
                                                 Allocator &allocator,
-                                                DT val) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  accessor.on_device =
-      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;
-
-  size_t volume = accessor.shape.get_volume();
-  std::vector<DT> host_data(volume, val);
-
-  transfer_memory(accessor, host_data.data(), AllocLocation::HOST);
-
-  return accessor;
-}
-
-template <typename DT, typename F>
-GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape,
-                                                     Allocator &allocator,
-                                                     F transform) {
+                                                T val) {
   GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  accessor.on_device =
-      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;

   size_t volume = accessor.shape.get_volume();
-  std::vector<DT> input_data(volume);
-  std::vector<DT> output_data(volume);
-
-  std::transform(
-      input_data.begin(), input_data.end(), output_data.begin(), transform);
+  std::vector<T> host_data(volume, val);

-  transfer_memory(accessor, output_data.data(), AllocLocation::HOST);
+  transfer_memory(accessor, host_data.data(), DeviceType::CPU);

   return accessor;
 }

 template <DataType DT>
-GenericTensorAccessorW
-    copy_tensor_between_memories(GenericTensorAccessorR accessor,
-                                 Allocator &allocator) {
-  TensorShape shape = get_tensor_shape(accessor.shape, accessor.data_type);
-  GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape);
-  copied_accessor.on_device =
-      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;
-
-  AllocLocation src_loc =
-      accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST;
-
-  transfer_memory(copied_accessor, accessor.get<DT>(), src_loc);
-
-  return copied_accessor;
-}
-
-TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims,
-                                               DataType DT);
-
-template <DataType DT>
-std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorR accessor) {
-  using T = real_type<DT>;
+std::vector<real_type_t<DT>>
+    load_accessor_data(GenericTensorAccessorR accessor) {
+  using T = real_type_t<DT>;

   int volume = accessor.shape.get_volume();
   std::vector<T> local_data(volume);
   T const *src_ptr = accessor.get<DT>();

-  if (accessor.on_device) {
+  if (device_on_cpu(accessor.device_type)) {
+    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
+  } else {
     checkCUDA(cudaMemcpy(local_data.data(),
                          src_ptr,
                          volume * sizeof(T),
                          cudaMemcpyDeviceToHost));
-  } else {
-    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
   }

   return local_data;
 }

 template <DataType DT>
-std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorW accessor) {
-  using T = real_type<DT>;
+std::vector<real_type_t<DT>>
+    load_accessor_data(GenericTensorAccessorW accessor) {
+  using T = real_type_t<DT>;

   int volume = accessor.shape.get_volume();
   std::vector<T> local_data(volume);
   T const *src_ptr = accessor.get<DT>();

-  if (accessor.on_device) {
+  if (device_on_cpu(accessor.device_type)) {
+    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
+  } else {
     checkCUDA(cudaMemcpy(local_data.data(),
                          src_ptr,
                          volume * sizeof(T),
                          cudaMemcpyDeviceToHost));
-  } else {
-    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
   }

   return local_data;

diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/local-execution/include/local-execution/local_cpu_allocator.h
index d1e81facf2..cf6cfe35d1 100644
--- a/lib/local-execution/include/local-execution/local_cpu_allocator.h
+++ b/lib/local-execution/include/local-execution/local_cpu_allocator.h
@@ -12,6 +12,8 @@ struct LocalCPUAllocator : public IAllocator {
   void *allocate(size_t) override;
   void deallocate(void *) override;

+  DeviceType get_allocation_device_type() const override;
+
 private:
   std::unordered_map<void *, std::unique_ptr<void, decltype(&free)>> ptrs;
 };

diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h
index d6f338fe14..f697337c52 100644
--- a/lib/local-execution/include/local-execution/tracked_allocator.h
+++ b/lib/local-execution/include/local-execution/tracked_allocator.h
@@ -12,8 +12,10 @@ struct TrackedAllocator : public IAllocator {
   ~TrackedAllocator() = default;

   void *allocate(size_t) override;
-  void *allocate_and_zero(size_t) override;
   void deallocate(void *) override;
+
+  DeviceType get_allocation_device_type() const override;
+
   size_t get_current_mem_usage();

 private:

diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/local-execution/src/local_cpu_allocator.cc
index 4ca5f987a8..c4657e26b5 100644
--- a/lib/local-execution/src/local_cpu_allocator.cc
+++ b/lib/local-execution/src/local_cpu_allocator.cc
@@ -17,6 +17,10 @@ void LocalCPUAllocator::deallocate(void *ptr) {
   }
 }

+DeviceType LocalCPUAllocator::get_allocation_device_type() const {
+  return DeviceType::CPU;
+}
+
 Allocator create_local_cpu_memory_allocator() {
   return Allocator::create<LocalCPUAllocator>();
 }
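Note: with get_allocation_device_type() available on every allocator, code that builds accessors (like the task argument accessor below) can record where the backing memory lives instead of threading a separate on_device flag around. A small sketch of the idea; Buffer is illustrative, while Allocator and DeviceType are as in this patch:

    // Tag data with the device type of the allocator that produced it, so a
    // later transfer can choose the right cudaMemcpyKind.
    struct Buffer {
      void *ptr;
      DeviceType device_type;
    };

    Buffer make_buffer(Allocator &a, size_t n) {
      return Buffer{a.allocate(n), a.get_allocation_device_type()};
    }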
this->allocator.get_allocation_device_type()}); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 9f13f006f3..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,12 +12,6 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } -void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr = this->allocator.allocate_and_zero(requested_memory_size); - this->current_mem_usage += requested_memory_size; - return ptr; -} - void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); @@ -29,9 +23,12 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { Allocator allocator = Allocator::create(base_allocator); - allocator.alloc_location = base_allocator.alloc_location; return allocator; } From 0304f17e77563c35ca9daa6c3c6bcd9a4a5bb2a1 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:26:05 -0700 Subject: [PATCH 08/42] accessor.h formatting --- lib/kernels/include/kernels/accessor.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 846115060f..e63e77d0ad 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -31,7 +31,10 @@ class GenericTensorAccessorW { GenericTensorAccessorW() = delete; - GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, void *ptr, DeviceType device_type); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); bool operator==(GenericTensorAccessorW const &) const; bool operator!=(GenericTensorAccessorW const &) const; From 7c3ff87421165a22d8e20dee5eaafb4bd3aa51f5 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:47:55 -0700 Subject: [PATCH 09/42] mk_runtime_error formatting --- lib/kernels/include/kernels/accessor.h | 10 +++++----- lib/kernels/src/accessor.cc | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index e63e77d0ad..e29f73924c 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -45,8 +45,8 @@ class GenericTensorAccessorW { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t
<DT>; @@ -63,8 +63,8 @@ class GenericTensorAccessorW { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t<DT>
; @@ -130,7 +130,7 @@ class GenericTensorAccessorR { } if (this->data_type != DT) { throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + fmt::format("Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t<DT>
; diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index c0b11a2299..a2b3e94d33 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -21,10 +21,10 @@ size_t GenericTensorAccessorW::calculate_index_offset( std::initializer_list const &indices) const { if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error( + throw mk_runtime_error(fmt::format( "Number of indices ({}) does not match the number of dimensions ({}).", indices.size(), - this->shape.num_dims()); + this->shape.num_dims())); } size_t offset = 0; @@ -36,11 +36,11 @@ size_t GenericTensorAccessorW::calculate_index_offset( cur_idx = *it--; if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error("In {} dimension, attempting to access index {} " + throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, cur_idx, - this->shape[legion_dim_t(i)]); + this->shape[legion_dim_t(i)])); } offset += cur_idx * multiplier; @@ -110,10 +110,10 @@ size_t GenericTensorAccessorR::calculate_index_offset( std::initializer_list const &indices) const { if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error( + throw mk_runtime_error(fmt::format( "Number of indices ({}) does not match the number of dimensions ({}).", indices.size(), - this->shape.num_dims()); + this->shape.num_dims())); } size_t offset = 0; @@ -125,11 +125,11 @@ size_t GenericTensorAccessorR::calculate_index_offset( cur_idx = *it--; if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error("In {} dimension, attempting to access index {} " + throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, cur_idx, - this->shape[legion_dim_t(i)]); + this->shape[legion_dim_t(i)])); } offset += cur_idx * multiplier; From 65d78049c2d2cb933e5cf2be9545ee00693a9b97 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 01:08:59 -0700 Subject: [PATCH 10/42] reverse_kernels include --- lib/kernels/src/cpu/reverse_kernels.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index 1971435d8c..afa92b307c 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,4 +1,5 @@ #include "kernels/reverse_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" #include #include From 7c5fb1fa853fb91f0245a84910c0aa86a2f89db4 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:40:12 -0700 Subject: [PATCH 11/42] test_utils refactor and clarity --- lib/kernels/include/kernels/accessor.h | 22 ++- lib/kernels/include/kernels/allocation.h | 2 +- .../include/kernels/replicate_kernels.h | 2 +- .../include/kernels/replicate_kernels_cpu.h | 6 +- .../include/kernels/reverse_kernels_cpu.h | 14 +- lib/kernels/src/accessor.cc | 130 ++++++++++++++---- lib/kernels/src/allocation.cc | 6 +- lib/kernels/src/cpu/replicate_kernels.cc | 53 +++---- lib/kernels/src/cpu/reverse_kernels.cc | 63 +++------ lib/kernels/src/cuda/ops/replicate_kernels.cu | 6 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 13 +- lib/kernels/test/src/test_attention_kernel.cc | 26 ++-- .../test/src/test_batch_matmul_kernel.cc | 12 +- .../test/src/test_batch_norm_kernel.cc | 15 +- lib/kernels/test/src/test_cast_kernel.cc | 49 ------- lib/kernels/test/src/test_dropout.cc | 6 +- lib/kernels/test/src/test_gather_kernels.cc | 3 +- .../test/src/test_layer_norm_kernels.cc | 3 +- 
lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_replicate_kernel.cc | 80 ----------- lib/kernels/test/src/test_reverse_kernels.cc | 102 +------------- lib/kernels/test/src/test_softmax_kernel.cc | 6 +- lib/kernels/test/src/test_split_kernel.cc | 6 +- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- lib/kernels/test/src/test_utils.cc | 77 +++++++++++ lib/kernels/test/src/test_utils.h | 130 ++++-------------- .../src/local_task_argument_accessor.cc | 12 +- lib/local-execution/src/ops/replicate.cc | 2 +- 28 files changed, 333 insertions(+), 522 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index e29f73924c..0a134db695 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -11,6 +11,8 @@ namespace FlexFlow { +struct Allocator; + class GenericTensorAccessorW { public: template @@ -129,8 +131,8 @@ class GenericTensorAccessorR { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", this->data_type, DT)); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t
; @@ -255,6 +257,22 @@ std::pair std::pair get_shape_and_datatype(GenericTensorAccessorW const &accessor); +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); + +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorW const &src_accessor); + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 893be513ea..4bf97118ce 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 877eeabf04..7ed55cd1a1 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -11,8 +11,8 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h index a72b799875..1c7aa4ee4a 100644 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -7,10 +7,10 @@ namespace FlexFlow::Kernels::Replicate { void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorW &output); -void cpu_backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas); } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index b0edaa264c..35af06aafb 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -1,22 +1,16 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H -#include "accessor.h" -#include "device.h" +#include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow::Kernels::Reverse { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size); + GenericTensorAccessorW &output_accessor); void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, - GenericTensorAccessorW &input_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size); + GenericTensorAccessorW &input_accessor); } // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index a2b3e94d33..9332dd6703 100644 --- 
a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,7 +1,45 @@ #include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" namespace FlexFlow { +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = dst_accessor.shape.get_volume() * + size_of_datatype(dst_accessor.data_type); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorW const &src_accessor) { + GenericTensorAccessorR r_src_accessor = + read_only_accessor_from_write_accessor(src_accessor); + transfer_data_between_accessors(dst_accessor, r_src_accessor); +} + GenericTensorAccessorW::GenericTensorAccessorW( DataType data_type, ArrayShape const &shape, @@ -30,21 +68,22 @@ size_t GenericTensorAccessorW::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; size_t cur_idx; - auto it = indices.end() - 1; - - for (std::size_t i = this->shape.num_dims(); i-- > 0;) { - cur_idx = *it--; - - if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - cur_idx, - this->shape[legion_dim_t(i)])); + auto it = indices.begin(); + + for (size_t i = 0; i < this->shape.num_dims(); i++) { + cur_idx = *it++; + + if (cur_idx >= this->shape.at(legion_dim_t(i))) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + cur_idx, + this->shape.at(legion_dim_t(i)))); } offset += cur_idx * multiplier; - multiplier *= this->shape[legion_dim_t(i)]; + multiplier *= this->shape.at(legion_dim_t(i)); } return offset; @@ -119,21 +158,22 @@ size_t GenericTensorAccessorR::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; size_t cur_idx; - auto it = indices.end() - 1; - - for (std::size_t i = this->shape.num_dims(); i-- > 0;) { - cur_idx = *it--; - - if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - cur_idx, - this->shape[legion_dim_t(i)])); + auto it = indices.begin(); + + for (size_t i = 0; i < this->shape.num_dims(); i++) { + cur_idx = *it++; + + if (cur_idx >= this->shape.at(legion_dim_t(i))) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + cur_idx, + this->shape.at(legion_dim_t(i)))); } offset += cur_idx * multiplier; - multiplier *= this->shape[legion_dim_t(i)]; + multiplier *= this->shape.at(legion_dim_t(i)); } return offset; @@ -307,4 +347,46 @@ std::pair return std::make_pair(accessor.shape, 
accessor.data_type); } +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + transfer_data_between_accessors(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, std::ref(allocator)); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + transfer_data_between_accessors(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, std::ref(allocator)); +} + } // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 751cdc0ebb..733146851a 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -18,8 +18,10 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return { - tensor_shape.data_type, tensor_shape, ptr, get_allocation_device_type()}; + return {tensor_shape.data_type, + tensor_shape, + ptr, + this->get_allocation_device_type()}; } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 5853869047..683739b91e 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -3,52 +3,43 @@ namespace FlexFlow::Kernels::Replicate { -template -void cpu_replicate_backward_kernel(T *input, - T const *output, - size_t num_elements, - size_t num_replicas) { - for (size_t i = 0; i < num_elements; i++) { - T sum = 0; - for (size_t j = 0; j < num_replicas; j++) { - sum += output[i + j * num_elements]; - } - input[i] = sum; - } -} - -template +template struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - memcpy(output.get(), - input.get(), - input.shape.num_elements() * size_of_datatype(T)); + GenericTensorAccessorW &output) { + memcpy(output.get
<DT>(), + input.get<DT>
(), + input.shape.num_elements() * size_of_datatype(DT)); } }; -template <DataType T> +template <DataType DT> struct CPUBackwardKernel { - void operator()(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas) { - cpu_replicate_backward_kernel(input.get<T>(), - output.get<T>(), - input.shape.num_elements(), - num_replicas); + using T = real_type_t<DT>
; + for (size_t i = 0; i < input.shape.num_elements(); i++) { + T cur_sum = 0; + for (size_t j = 0; j < num_replicas; j++) { + cur_sum += output.at<DT>
(i, j); + } + input.at<DT>
(i) = cur_sum; + } } }; void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); + GenericTensorAccessorW &output) { + DataTypeDispatch1{}( + input.data_type, input, std::ref(output)); } -void cpu_backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, input, output, num_replicas); + input.data_type, output, std::ref(input), num_replicas); } } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index afa92b307c..bc114c4e60 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/reverse_kernels_cpu.h" #include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" #include #include @@ -8,31 +8,20 @@ namespace FlexFlow::Kernels::Reverse { template struct CPUReverseForwardKernel { void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { + GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - // For each output block, copy the input block - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { - for (coord_t i = 0; i < in_blk_size; ++i) { - output.at
<DT>(blk_idx, rev_idx, i) = - input.at<DT>
(blk_idx, rev_idx, i); - } - } - } - - // Reverse the blocks within each output block - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size / 2; ++rev_idx) { - coord_t start_idx = rev_idx; - coord_t end_idx = reverse_dim_size - 1 - rev_idx; - - for (coord_t i = 0; i < in_blk_size; ++i) { - std::swap(output.at<DT>
(blk_idx, start_idx, i), - output.at<DT>
(blk_idx, end_idx, i)); + coord_t num_out_blocks = input.shape.at(legion_dim_t(0)); + coord_t reverse_dim_size = input.shape.at(legion_dim_t(1)); + coord_t in_block_size = input.shape.at(legion_dim_t(2)); + + for (coord_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { + for (coord_t i = 0; i < in_block_size; i++) { + output.at<DT>
(block_idx, rev_idx, i) = + input.at<DT>
(num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i); } } } @@ -40,29 +29,15 @@ struct CPUReverseForwardKernel { }; void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { - DataTypeDispatch1{}(input_accessor.data_type, - input_accessor, - std::ref(output_accessor), - num_out_blks, - reverse_dim_size, - in_blk_size); + GenericTensorAccessorW &output_accessor) { + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, std::ref(output_accessor)); } void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, - GenericTensorAccessorW &input_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { - DataTypeDispatch1{}(output_accessor.data_type, - output_accessor, - std::ref(input_accessor), - num_out_blks, - reverse_dim_size, - in_blk_size); + GenericTensorAccessorW &input_accessor) { + DataTypeDispatch1{}( + output_accessor.data_type, output_accessor, std::ref(input_accessor)); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 76bfbe2658..1aa61375f0 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -50,8 +50,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; replicate_backward_kernel> @@ -70,11 +70,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index f73c57dedf..8e93fec0d6 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -27,6 +27,7 @@ namespace Reverse { // coord_t reverse_dim_size, // coord_t in_blk_size) { // CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t out_idx = i; // coord_t blk_idx = i / (reverse_dim_size * in_blk_size); // i = i - blk_idx * (reverse_dim_size * in_blk_size); // coord_t reverse_dim_idx = i / in_blk_size; @@ -34,8 +35,18 @@ namespace Reverse { // coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + // (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + // i; -// out_ptr[i] = in_ptr[in_idx]; +// out_ptr[out_idx] = in_ptr[in_idx]; // } +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = in_ptr[in_idx]; +// } // } /* I mentioned this earlier, but I still think the reverse_forward_kernel code diff --git 
a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index c4a3f7bd50..023233ecb0 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -47,16 +47,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -76,20 +73,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, - allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index bb9c4c07bd..8a11a069f5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -29,14 +29,11 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, - allocator); + create_random_filled_accessor_w(input_shape_a, allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, - allocator); + create_random_filled_accessor_w(input_shape_b, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -55,8 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 43bcc5528a..03a3a1ad40 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ 
b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -36,8 +36,7 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w( @@ -59,17 +58,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, - allocator); + create_random_filled_accessor_w(scale_shape, allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, - allocator); + create_random_filled_accessor_w(bias_shape, allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index e9674cd167..1be5839a9c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -75,53 +75,4 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } - - TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); - - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - - // Only calling forward kernel as backward kernel is exactly the same - SUBCASE("forward_kernel") { - // Run GPU Forward Kernel - GenericTensorAccessorW input_accessor_gpu = - create_random_filled_accessor_w(input_shape, - gpu_allocator); - Kernels::Cast::forward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(input_accessor_gpu), - output_accessor_gpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - create_random_filled_accessor_w(input_shape, - cpu_allocator); - Kernels::Cast::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 7ff364bada..4be2bdf7bb 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -47,11 +47,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - 
create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 4f05c89813..7f97563217 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -41,8 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 3ac0e1425f..7d7298f83d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -56,8 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index f71d9cfa11..00fa968235 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -45,11 +45,9 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 8af741b3a7..27223cc7b5 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -110,84 +110,4 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } - - TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { - std::size_t num_replicas = 2; - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); - - ManagedPerDeviceFFHandle managed_handle{}; - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - // Run GPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - 
fill_with_zeros(output_accessor_gpu); - - Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); - - Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); - - Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor_gpu, - output_grad_accessor_gpu, - num_replicas); - - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - - // Run CPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); - - Kernels::Replicate::cpu_backward_kernel( - input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index b1f90a0a7e..4adf79847a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -42,8 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -137,103 +136,4 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } - - TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 2; - std::size_t reverse_dim_size = 3; - std::size_t in_blk_size = 5; - - TensorShape input_shape = make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); - TensorShape output_shape = input_shape; - - ManagedPerDeviceFFHandle managed_handle{}; - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - auto transform = [counter = 0.0f](float val) mutable { - return counter++; - }; - - // Run GPU Cast Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_gpu); - - 
Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu.get_float_ptr(), - output_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_gpu.shape.num_elements()); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Cast Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); - - Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Cast Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); - - Kernels::Reverse::backward_kernel( - managed_stream.raw_stream(), - output_grad_accessor_gpu.get_float_ptr(), - input_grad_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_gpu.shape.num_elements()); - - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - - // Run CPU Cast Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); - - Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, - input_grad_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 88f24a1a08..5519c30b80 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -23,13 +23,11 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 9f1d390501..34993fa151 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -27,8 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -49,8 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector 
output_grad_ptrs(num_outputs); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index c8baaac54f..0bc85cb8e0 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -39,8 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index bfed1241ba..ca9e9e9c11 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -108,6 +108,83 @@ struct CPUAccessorRContainsNonZero { } }; +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +bool contains_non_zero(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + return contains_non_zero(r_accessor); +} + +GenericTensorAccessorR + create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} + +GenericTensorAccessorW + create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); + } + return cpu_accessor; +} + +template +struct PrintCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>; + + T const *data_ptr = accessor.get<DT>
(); + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + std::cout << data_ptr[i] << " "; + } + std::cout << "\n"; + } +}; + +void print_accessor(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + DataTypeDispatch1<PrintCPUAccessorR>{}(cpu_accessor.data_type, cpu_accessor); +} + +void print_accessor(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + print_accessor(r_accessor); +} + +template <DataType DT> +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t<DT>
; + + T const *data_ptr = accessor.get<DT>
(); + + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index a9d522b948..19599d2900 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -1,135 +1,59 @@ #ifndef _FLEXFLOW_KERNELS_TEST_UTILS #define _FLEXFLOW_KERNELS_TEST_UTILS +#include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" #include -#include #include #include #include using namespace FlexFlow; -template -void transfer_memory(GenericTensorAccessorW dst_accessor, - const DT *src, - DeviceType src_device_type) { - size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT); - - DeviceType dst_device_type = dst_accessor.device_type; - - if (device_on_cpu(src_device_type) && device_on_cpu(dst_device_type)) { - memcpy(dst_accessor.ptr, src, bytes); - } else if (device_on_cpu(src_device_type) && device_on_gpu(dst_device_type)) { - checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice)); - } else if (device_on_gpu(src_device_type) && device_on_cpu(dst_device_type)) { - checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost)); - } else { - checkCUDA( - cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToDevice)); - } -} - -template GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator) { - assert(shape.data_type == DataType::FLOAT || - shape.data_type == DataType::DOUBLE); - - using T = real_type_t
<DT>; - - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - - std::vector<T> host_data(accessor.shape.num_elements()); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<T> dist(-1.0, 1.0); + Allocator &allocator); - for (auto &val : host_data) { - val = dist(gen); - } - - transfer_memory(accessor, host_data.data(), DeviceType::CPU); - - return accessor; -} - -template <DataType DT> GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, - Allocator &allocator) { - using T = real_type_t<DT>
; - GenericTensorAccessorW accessor = - create_random_filled_accessor_w<DT>
(shape, allocator); + Allocator &allocator); - return read_only_accessor_from_write_accessor(accessor); -} +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); -template -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - T val) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered const &dims, + DataType DT); - size_t volume = accessor.shape.get_volume(); - std::vector host_data(volume, val); +bool contains_non_zero(GenericTensorAccessorR const &accessor); - transfer_memory(accessor, host_data.data(), DeviceType::CPU); +void fill_with_zeros(GenericTensorAccessorW const &accessor); - return accessor; -} +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); -template -std::vector> - load_accessor_data(GenericTensorAccessorR accessor) { - using T = real_type_t
<DT>; - - int volume = accessor.shape.get_volume(); - std::vector<T> local_data(volume); - T const *src_ptr = accessor.get<DT>
(); - - if (device_on_cpu(accessor.device_type)) { - memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } else { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); - } +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &allocator); - return local_data; -} +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor); -template -std::vector> - load_accessor_data(GenericTensorAccessorW accessor) { - using T = real_type_t
<DT>; - - int volume = accessor.shape.get_volume(); - std::vector<T> local_data(volume); - T const *src_ptr = accessor.get<DT>
(); - - if (device_on_cpu(accessor.device_type)) { - memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } else { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); - } +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); - return local_data; -} +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), data.end(), [](T const &val) { return val == 0; }); -} +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); template std::vector repeat(std::size_t n, Func &&func) { diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index f61ed7bc7b..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,11 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, - tensor_backing.shape, - tensor_backing.ptr, - this->allocator.get_allocation_device_type()}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -47,10 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, - tensor_backing.shape, - tensor_backing.ptr, - this->allocator.get_allocation_device_type()}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 135475a711..56bbfdd371 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -67,8 +67,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree); } From 8188afe1e8e0149bb9685dcd15c65bdc0a23a27c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:41:17 -0700 Subject: [PATCH 12/42] formatting --- .envrc | 3 +++ .vimrc | 8 ++++++++ 2 files changed, 11 insertions(+) create mode 100644 .envrc create mode 100644 .vimrc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..2797f0f929 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source_up_if_exists + +use flake diff --git a/.vimrc b/.vimrc new file mode 100644 index 0000000000..4c8a8a8279 --- /dev/null +++ b/.vimrc @@ -0,0 +1,8 @@ +" example search path configuration +set path=lib/runtime/**,lib/** + +" set build target +" let g:target = "pcg" + +" set test target +" let g:test_target = "utils-test" From a13255bacacb463fefdbc0d27a775d5828668a8e Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:55:20 -0700 Subject: [PATCH 13/42] comment removal reverse_kernels --- 
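A quick sketch of the index arithmetic in question (host-side C++, assuming
coord_t is a signed integer type; this is illustrative, not code from the
tree). Both the modulo-based variant removed below and the restored
index-mutating variant recover the same source element for a flat position p;
they differ only in the destination index at the final write, which is what
the deleted comment was questioning:

    #include <cstdint>
    using coord_t = int64_t; // assumption for this sketch

    struct ReverseIdx {
      coord_t src;         // input element both variants read
      coord_t dst_saved;   // write target if the loop index is saved first
      coord_t dst_mutated; // write target after i is reduced in place
    };

    ReverseIdx reverse_indices(coord_t p,
                               coord_t reverse_dim_size,
                               coord_t in_blk_size) {
      coord_t blk_idx = p / (reverse_dim_size * in_blk_size);
      coord_t within_blk = p % (reverse_dim_size * in_blk_size);
      coord_t reverse_dim_idx = within_blk / in_blk_size;
      coord_t in_idx_in_blk = within_blk % in_blk_size;
      coord_t src = blk_idx * (reverse_dim_size * in_blk_size) +
                    (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size +
                    in_idx_in_blk;
      return ReverseIdx{src, /*dst_saved=*/p, /*dst_mutated=*/in_idx_in_blk};
    }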
lib/kernels/src/cuda/ops/reverse_kernels.cu | 48 +++------------------ 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8e93fec0d6..2c25293c36 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -17,44 +17,9 @@ #include "kernels/reverse_kernels.h" namespace FlexFlow { - namespace Kernels { namespace Reverse { -// __global__ void reverse_forward_kernel(float const *in_ptr, -// float *out_ptr, -// coord_t num_out_blks, -// coord_t reverse_dim_size, -// coord_t in_blk_size) { -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t out_idx = i; -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[out_idx] = in_ptr[in_idx]; -// } -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[i] = in_ptr[in_idx]; -// } -// } - -/* I mentioned this earlier, but I still think the reverse_forward_kernel code - is incorrect, even though it matches the code in inference/master? Whenever - I'm testing the code and printing out the output, I'm getting unexpected - outputs, and I think it's a result of modifying the loop index i in the - previous code? 
-*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, @@ -62,13 +27,12 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = idx_within_blk / in_blk_size; - coord_t in_idx = idx_within_blk % in_blk_size; - coord_t input_index = - blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; - out_ptr[i] = in_ptr[input_index]; + i = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = i / in_blk_size; + i = i - reverse_dim_idx * in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; + out_ptr[i] = in_ptr[in_idx]; } } From 7ed56247a30b41f1791c66fc8a4544507a383103 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 19:22:55 -0700 Subject: [PATCH 14/42] Issue #1435, tests for managed stream and handle --- lib/kernels/src/managed_ff_stream.cc | 19 +++++++---- .../src/managed_per_device_ff_handle.cc | 33 +++++++++++------- .../test/src/test_managed_ff_stream.cc | 29 ++++++++++++++++ .../src/test_managed_per_device_ff_handle.cc | 34 +++++++++++++++++++ 4 files changed, 97 insertions(+), 18 deletions(-) create mode 100644 lib/kernels/test/src/test_managed_ff_stream.cc create mode 100644 lib/kernels/test/src/test_managed_per_device_ff_handle.cc diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..a8b44dc1d3 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,35 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete stream; + } + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ca105f9bc9 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -4,13 +4,13 @@ namespace FlexFlow { ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; + this->handle = new PerDeviceFFHandle; + this->handle->workSpaceSize = 1024 * 1024; + this->handle->allowTensorOpMathConversion = true; - checkCUDNN(cudnnCreate(&handle->dnn)); - 
checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +19,25 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; + } + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; + this->handle = nullptr; } } diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..1dc40f0a92 --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,29 @@ +#include "doctest/doctest.h" +#include "kernels/managed_ff_stream.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test Managed FF Stream") { + ManagedFFStream base_stream{}; + + SUBCASE("Test ManagedFFStream Move Constructor") { + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); + + ManagedFFStream new_stream(std::move(base_stream)); + + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + + SUBCASE("Test ManagedFFStream Assignment Operator") { + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); + + ManagedFFStream new_stream{}; + new_stream = std::move(base_stream); + + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..d99d375a7c --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,34 @@ +#include "doctest/doctest.h" +#include "kernels/managed_per_device_ff_handle.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test Managed Per Device FF Handle") { + ManagedPerDeviceFFHandle base_handle{}; + + SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + ManagedPerDeviceFFHandle 
new_handle{}; + new_handle = std::move(base_handle); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + } +} From c1758c08a43b72f062faa599468c9d743a6cf318 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 19:25:18 -0700 Subject: [PATCH 15/42] #1435 formatting --- lib/kernels/test/src/test_managed_ff_stream.cc | 6 +++--- lib/kernels/test/src/test_managed_per_device_ff_handle.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 1dc40f0a92..1dedb0c41d 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -5,11 +5,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed FF Stream") { - ManagedFFStream base_stream{}; + ManagedFFStream base_stream{}; SUBCASE("Test ManagedFFStream Move Constructor") { ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - + ManagedFFStream new_stream(std::move(base_stream)); CHECK(&base_stream.raw_stream() == nullptr); @@ -21,7 +21,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream new_stream{}; new_stream = std::move(base_stream); - + CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d99d375a7c..e85cfd61c7 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed Per Device FF Handle") { - ManagedPerDeviceFFHandle base_handle{}; + ManagedPerDeviceFFHandle base_handle{}; SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); @@ -26,7 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedPerDeviceFFHandle new_handle{}; new_handle = std::move(base_handle); - + CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } From 54b3888eb36776eb3d99901463777c4d592ee064 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 20:24:27 -0700 Subject: [PATCH 16/42] #1409 issue, change datatype for linear kernels away from void * --- lib/kernels/include/kernels/linear_kernels.h | 22 +++--- lib/kernels/src/cuda/ops/linear_kernels.cu | 76 +++++++++++--------- lib/local-execution/src/ops/linear.cc | 14 ++-- 3 files changed, 59 insertions(+), 53 deletions(-) diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 99549adece..cff6563629 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -50,23 +50,23 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); 
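// A brief note on the signature change above: moving from void * to typed
// float pointers lets the compiler reject a mismatched buffer at the call
// site instead of failing at runtime, and confines the unavoidable untyped
// casts to the cuBLAS calls in linear_kernels.cu. A hypothetical call site
// (illustrative only, not part of this patch) now reads:
//
//   forward_kernel(stream,
//                  per_device_state,
//                  input.get_float_ptr(),   // float const *
//                  output.get_float_ptr(),  // float *
//                  weight.get_float_ptr(),
//                  bias.get_float_ptr(),
//                  in_dim, out_dim, batch_size);
//
// Under the old void * signature the same call would also have accepted,
// say, an int64 buffer without complaint.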
void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_ptr, int in_dim, int out_dim, int batch_size); diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..29b77fd9d9 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + (void *)weight_ptr, weight_type, in_dim, - input_ptr, + (void *)input_ptr, input_type, in_dim, &beta, - output_ptr, + (void *)output_ptr, output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + (void *)bias_ptr, weight_type, 1, - m.one_ptr, + (void *)m.one_ptr, CUDA_R_32F, 1, &alpha, - output_ptr, + (void *)output_ptr, output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + (void *)output_ptr, &beta, m.outputTensor, - output_ptr)); + (void *)output_ptr)); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + (void *)output_grad_ptr, + (void *)output_ptr, + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + (void *)output_grad_ptr, + (void *)output_ptr, + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + (void *)input_ptr, input_type, in_dim, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - kernel_grad_ptr, + (void 
*)kernel_grad_ptr, weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float *)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + (void *)m.one_ptr, CUDA_R_32F, 1, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - bias_grad_ptr, + (void *)bias_grad_ptr, weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + (void *)kernel_ptr, weight_type, in_dim, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - input_grad_ptr, + (void *)input_grad_ptr, input_type, in_dim, compute_type, diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 3e0b4672ab..4637cb388e 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -148,13 +148,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + input.get_float_ptr(), + (float *)input_grad.get_float_ptr(), + output.get_float_ptr(), + (float *)output_grad.get_float_ptr(), + weight.get_float_ptr(), + (float *)weight_grad.get_float_ptr(), + (float *)bias_ptr, in_dim, out_dim, batch_size); From 5b5c2f6e6ea5d7198a5ac693d024970380e4cf34 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 4 Nov 2024 23:12:02 -0800 Subject: [PATCH 17/42] R & W accessor changes, minimize code bloat --- lib/kernels/include/kernels/accessor.h | 154 ++++++++---------- lib/kernels/include/kernels/cast_kernels.h | 8 +- .../include/kernels/cast_kernels_cpu.h | 8 +- .../include/kernels/datatype_dispatch.h | 10 +- .../kernels/managed_per_device_ff_handle.h | 5 +- lib/kernels/src/accessor.cc | 107 +++--------- lib/kernels/src/cpu/cast_kernels.cc | 13 +- lib/kernels/src/cpu/replicate_kernels.cc | 9 +- lib/kernels/src/cpu/reverse_kernels.cc | 24 +-- lib/kernels/src/cuda/ops/cast_kernels.cu | 12 +- lib/kernels/src/cuda/ops/linear_kernels.cu | 42 ++--- .../src/managed_per_device_ff_handle.cc | 8 +- .../test/src/test_managed_ff_stream.cc | 12 +- .../src/test_managed_per_device_ff_handle.cc | 14 +- lib/kernels/test/src/test_utils.cc | 77 --------- lib/local-execution/src/ops/cast.cc | 8 +- lib/local-execution/src/ops/linear.cc | 14 +- .../test/src/test_local_cost_estimator.cc | 2 +- 18 files changed, 171 insertions(+), 356 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 0a134db695..653c8db42d 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -13,54 +13,36 @@ namespace FlexFlow { struct Allocator; -class GenericTensorAccessorW { +class GenericTensorAccessorR { public: template - typename data_type_enum_to_class
<DT>::type *get() const { + typename data_type_enum_to_class<DT>
::type const *get() const { if (this->data_type == DT) { - return static_cast<real_type_t<DT> *>(this->ptr); + return static_cast<real_type_t<DT> const *>(this->ptr); } else { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } } - int32_t *get_int32_ptr() const; - int64_t *get_int64_ptr() const; - float *get_float_ptr() const; - double *get_double_ptr() const; - half *get_half_ptr() const; + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; - GenericTensorAccessorW() = delete; + GenericTensorAccessorR() = delete; - GenericTensorAccessorW(DataType data_type, + GenericTensorAccessorR(DataType data_type, ArrayShape const &shape, - void *ptr, + void const *ptr, DeviceType device_type); - bool operator==(GenericTensorAccessorW const &) const; - bool operator!=(GenericTensorAccessorW const &) const; - - template <DataType DT, typename... Indices> - real_type_t<DT>
&at(Indices... indices) { - if (this->device_type != DeviceType::CPU) { - throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); - } - if (this->data_type != DT) { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } - - using T = real_type_t<DT>
; - - T *data_ptr = static_cast<T *>(this->ptr); - size_t offset = calculate_index_offset({static_cast<size_t>(indices)...}); - - return data_ptr[offset]; - } + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; - template <DataType DT, typename... Indices> - real_type_t<DT>
const &at(Indices... indices) const { + template <DataType DT> + real_type_t<DT>
const &at(std::vector<size_t> const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -72,7 +54,7 @@ class GenericTensorAccessorW { using T = real_type_t<DT>
; T const *data_ptr = static_cast<T const *>(this->ptr); - size_t offset = calculate_index_offset({static_cast<size_t>(indices)...}); + size_t offset = calculate_index_offset(indices); return data_ptr[offset]; } @@ -80,7 +62,7 @@ class GenericTensorAccessorW { public: DataType data_type; ArrayShape shape; - void *ptr; + void const *ptr; DeviceType device_type; private: @@ -90,43 +72,62 @@ class GenericTensorAccessorW { decltype(device_type) const &> tie() const; - size_t calculate_index_offset( - std::initializer_list<size_t> const &indices) const; + size_t calculate_index_offset(std::vector<size_t> const &indices) const; }; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); -class GenericTensorAccessorR { +class GenericTensorAccessorW { public: template <DataType DT> - typename data_type_enum_to_class<DT>
::type const *get() const { + typename data_type_enum_to_class<DT>
::type *get() const { if (this->data_type == DT) { - return static_cast<real_type_t<DT> const *>(this->ptr); + return static_cast<real_type_t<DT> *>(this->ptr); } else { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + int32_t *get_int32_ptr() const; + int64_t *get_int64_ptr() const; + float *get_float_ptr() const; + double *get_double_ptr() const; + half *get_half_ptr() const; - GenericTensorAccessorR() = delete; + GenericTensorAccessorW() = delete; - GenericTensorAccessorR(DataType data_type, + GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, - void const *ptr, + void *ptr, DeviceType device_type); - bool operator==(GenericTensorAccessorR const &) const; - bool operator!=(GenericTensorAccessorR const &) const; + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; + + template <DataType DT> + real_type_t<DT>
&at(std::vector<size_t> const &indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + + using T = real_type_t<DT>
; + + T *data_ptr = static_cast<T *>(this->ptr); + size_t offset = calculate_index_offset(indices); + + return data_ptr[offset]; + } - template <DataType DT, typename... Indices> - real_type_t<DT>
const &at(Indices... indices) const { + template <DataType DT> + real_type_t<DT>
&at(std::vector<size_t> const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -138,7 +139,7 @@ class GenericTensorAccessorR { using T = real_type_t<DT>
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset({static_cast(indices)...}); + size_t offset = calculate_index_offset(indices); return data_ptr[offset]; } @@ -146,7 +147,7 @@ class GenericTensorAccessorR { public: DataType data_type; ArrayShape shape; - void const *ptr; + void *ptr; DeviceType device_type; private: @@ -156,27 +157,11 @@ class GenericTensorAccessorR { decltype(device_type) const &> tie() const; - size_t calculate_index_offset( - std::initializer_list const &indices) const; + size_t calculate_index_offset(std::vector const &indices) const; }; -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); - -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); @@ -241,12 +226,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -254,16 +235,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); - -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorR const &src_accessor); -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorW const &src_accessor); +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index f67613cec6..21e76fed1d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -8,15 +8,11 @@ namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); } // namespace FlexFlow::Kernels::Cast diff --git 
a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 959617dcae..275476b4e6 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -7,14 +7,10 @@ namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index 0986d99791..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -34,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -42,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -55,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -69,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..f9f944c6ff 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 9332dd6703..4cb5bd83a2 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -4,7 +4,7 @@ namespace FlexFlow { -void transfer_data_between_accessors( +void copy_accessor_data_to_l_from_r( GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor) { size_t num_bytes = dst_accessor.shape.get_volume() * @@ -25,6 +25,8 @@ void transfer_data_between_accessors( checkCUDA(cudaMemcpy( dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); } else { + assert(src_device_type == DeviceType::GPU); + assert(src_device_type == DeviceType::CPU); checkCUDA(cudaMemcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes, @@ -32,12 +34,8 @@ void transfer_data_between_accessors( } } -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorW const &src_accessor) { - GenericTensorAccessorR r_src_accessor = - read_only_accessor_from_write_accessor(src_accessor); - transfer_data_between_accessors(dst_accessor, r_src_accessor); +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); } GenericTensorAccessorW::GenericTensorAccessorW( @@ -56,7 +54,7 @@ std::tuple const &indices) const { + std::vector const &indices) const { if (indices.size() != this->shape.num_dims()) { throw mk_runtime_error(fmt::format( @@ -67,22 +65,18 @@ size_t GenericTensorAccessorW::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; - size_t cur_idx; - auto it = indices.begin(); for (size_t i = 0; i < this->shape.num_dims(); i++) { - cur_idx = *it++; - - if (cur_idx >= this->shape.at(legion_dim_t(i))) { + if (indices[i] >= this->shape.at(legion_dim_t(i))) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, - cur_idx, + indices[i], this->shape.at(legion_dim_t(i)))); } - offset += cur_idx * multiplier; + offset += indices[i] 
* multiplier; multiplier *= this->shape.at(legion_dim_t(i)); } @@ -146,7 +140,7 @@ std::tuple const &indices) const { + std::vector const &indices) const { if (indices.size() != this->shape.num_dims()) { throw mk_runtime_error(fmt::format( @@ -155,24 +149,20 @@ size_t GenericTensorAccessorR::calculate_index_offset( this->shape.num_dims())); } - size_t offset = 0; + ssize_t offset = 0; size_t multiplier = 1; - size_t cur_idx; - auto it = indices.begin(); for (size_t i = 0; i < this->shape.num_dims(); i++) { - cur_idx = *it++; - - if (cur_idx >= this->shape.at(legion_dim_t(i))) { + if (indices[i] >= this->shape.at(legion_dim_t(i))) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, - cur_idx, + indices[i], this->shape.at(legion_dim_t(i)))); } - offset += cur_idx * multiplier; + offset += indices[i] * multiplier; multiplier *= this->shape.at(legion_dim_t(i)); } @@ -220,51 +210,6 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { return (s << fmt::to_string(a)); } -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { return get(a); } @@ -318,18 +263,11 @@ GenericTensorAccessorR read_only_accessor_from_write_accessor( writable.device_type}; } -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -342,11 +280,6 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - template struct CopyTensorAccessorW { GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, @@ -355,7 +288,7 @@ struct CopyTensorAccessorW { get_tensor_shape(src_accessor.shape, src_accessor.data_type); GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return dst_accessor; } @@ -365,7 +298,7 @@ GenericTensorAccessorW copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - 
src_accessor.data_type, src_accessor, std::ref(allocator)); + src_accessor.data_type, src_accessor, allocator); } template @@ -376,7 +309,7 @@ struct CopyTensorAccessorR { get_tensor_shape(src_accessor.shape, src_accessor.data_type); GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return read_only_accessor_from_write_accessor(dst_accessor); } @@ -386,7 +319,7 @@ GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, std::ref(allocator)); + src_accessor.data_type, src_accessor, allocator); } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 2d3f440c75..5a00503fe4 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -37,18 +37,15 @@ struct CPUBackwardKernel { }; void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, input, output); + input.data_type, output.data_type, input, output); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 683739b91e..25693b374d 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -22,24 +22,23 @@ struct CPUBackwardKernel { for (size_t i = 0; i < input.shape.num_elements(); i++) { T cur_sum = 0; for (size_t j = 0; j < num_replicas; j++) { - cur_sum += output.at
<DT>(i, j); + cur_sum += output.at<DT>
({i, j}); } - input.at<DT>
(i) = cur_sum; + input.at<DT>
({i}) = cur_sum; } } }; void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW &output) { - DataTypeDispatch1{}( - input.data_type, input, std::ref(output)); + DataTypeDispatch1{}(input.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, output, std::ref(input), num_replicas); + input.data_type, output, input, num_replicas); } } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index bc114c4e60..e5b3719d74 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,17 +11,17 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - coord_t num_out_blocks = input.shape.at(legion_dim_t(0)); - coord_t reverse_dim_size = input.shape.at(legion_dim_t(1)); - coord_t in_block_size = input.shape.at(legion_dim_t(2)); + size_t num_out_blocks = input.shape.at(legion_dim_t(0)); + size_t reverse_dim_size = input.shape.at(legion_dim_t(1)); + size_t in_block_size = input.shape.at(legion_dim_t(2)); - for (coord_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { - for (coord_t i = 0; i < in_block_size; i++) { - output.at
<DT>(block_idx, rev_idx, i) = - input.at<DT>
(num_out_blocks - 1 - block_idx, reverse_dim_size - 1 - rev_idx, in_block_size - 1 - i); + output.at<DT>
({block_idx, rev_idx, i}) = + input.at<DT>
({num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i}); } } } @@ -31,13 +31,13 @@ struct CPUReverseForwardKernel { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, GenericTensorAccessorW &output_accessor) { DataTypeDispatch1{}( - input_accessor.data_type, input_accessor, std::ref(output_accessor)); + input_accessor.data_type, input_accessor, output_accessor); } void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, GenericTensorAccessorW &input_accessor) { DataTypeDispatch1{}( - output_accessor.data_type, output_accessor, std::ref(input_accessor)); + output_accessor.data_type, output_accessor, input_accessor); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index b895ffb68f..dc342fd0e0 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -60,20 +60,16 @@ struct BackwardKernel { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 29b77fd9d9..f13ebee67e 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - (void *)weight_ptr, + reinterpret_cast(weight_ptr), weight_type, in_dim, - (void *)input_ptr, + reinterpret_cast(input_ptr), input_type, in_dim, &beta, - (void *)output_ptr, + reinterpret_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - (void *)bias_ptr, + reinterpret_cast(bias_ptr), weight_type, 1, - (void *)m.one_ptr, + reinterpret_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - (void *)output_ptr, + reinterpret_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - (void *)output_ptr, + reinterpret_cast(output_ptr), &beta, m.outputTensor, - (void *)output_ptr)); + reinterpret_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -217,14 +217,14 @@ void backward_kernel(cudaStream_t stream, if (m.activation.has_value()) { if (m.activation == Activation::RELU) { relu_backward_kernel(m.output_type, - (void *)output_grad_ptr, - (void *)output_ptr, + reinterpret_cast(output_grad_ptr), + reinterpret_cast(output_ptr), output_size, stream); } else if (m.activation == Activation::SIGMOID) { sigmoid_backward_kernel(m.output_type, - (void *)output_grad_ptr, - (void *)output_ptr, + reinterpret_cast(output_grad_ptr), + reinterpret_cast(output_ptr), output_size, 
stream); } else { @@ -241,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - (void *)input_ptr, + reinterpret_cast(input_ptr), input_type, in_dim, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)kernel_grad_ptr, + reinterpret_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -290,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - (void *)m.one_ptr, + reinterpret_cast(m.one_ptr), CUDA_R_32F, 1, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)bias_grad_ptr, + reinterpret_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -313,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - (void *)kernel_ptr, + reinterpret_cast(kernel_ptr), weight_type, in_dim, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)input_grad_ptr, + reinterpret_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index ca105f9bc9..5bd49dc26f 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -3,10 +3,11 @@ namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { this->handle = new PerDeviceFFHandle; - this->handle->workSpaceSize = 1024 * 1024; - this->handle->allowTensorOpMathConversion = true; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; checkCUDNN(cudnnCreate(&this->handle->dnn)); checkCUBLAS(cublasCreate(&this->handle->blas)); @@ -37,7 +38,6 @@ ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { checkCUBLAS(cublasDestroy(this->handle->blas)); checkCUDA(cudaFree(this->handle->workSpace)); delete this->handle; - this->handle = nullptr; } } diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 1dedb0c41d..ce8a808454 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -6,24 +6,24 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed FF Stream") { ManagedFFStream base_stream{}; + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); SUBCASE("Test ManagedFFStream Move Constructor") { - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - ManagedFFStream new_stream(std::move(base_stream)); - CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } SUBCASE("Test ManagedFFStream Assignment Operator") { - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - ManagedFFStream new_stream{}; new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } + + SUBCASE("Test Self-Assignment") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index e85cfd61c7..d39da03ba9 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ 
b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed Per Device FF Handle") { - ManagedPerDeviceFFHandle base_handle{}; + ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); @@ -13,8 +14,6 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { - PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); @@ -22,13 +21,16 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { - PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - - ManagedPerDeviceFFHandle new_handle{}; + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; new_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } + + SUBCASE("Test Self-Assignment") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index ca9e9e9c11..bfed1241ba 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -108,83 +108,6 @@ struct CPUAccessorRContainsNonZero { } }; -bool contains_non_zero(GenericTensorAccessorR const &accessor) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - return DataTypeDispatch1{}( - cpu_accessor.data_type, cpu_accessor); -} - -bool contains_non_zero(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - return contains_non_zero(r_accessor); -} - -GenericTensorAccessorR - create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorR cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); - } - return cpu_accessor; -} - -GenericTensorAccessorW - create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorW cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); - } - return cpu_accessor; -} - -template -struct PrintCPUAccessorR { - void operator()(GenericTensorAccessorR const &accessor) { - using T = real_type_t
<DT>; - - T const *data_ptr = accessor.get<DT>
(); - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - std::cout << data_ptr[i] << " "; - } - std::cout << "\n"; - } -}; - -void print_accessor(GenericTensorAccessorR const &accessor) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - DataTypeDispatch1<PrintCPUAccessorR>{}(accessor.data_type, accessor); -} - -void print_accessor(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - print_accessor(r_accessor); -} - -template <DataType DT> -struct CPUAccessorRContainsNonZero { - bool operator()(GenericTensorAccessorR const &accessor) { - using T = real_type_t<DT>
; - - T const *data_ptr = accessor.get<DT>
(); - - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - if (data_ptr[i] != 0) { - return true; - } - } - - return false; - } -}; - bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 4637cb388e..fd2c1cd5e4 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -125,17 +125,17 @@ static std::optional auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_ptr = NULL; if (attrs.use_bias) { bias_ptr = bias.get_float_ptr(); } @@ -149,12 +149,12 @@ static std::optional "[Linear] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), - (float *)input_grad.get_float_ptr(), + input_grad.get_float_ptr(), output.get_float_ptr(), - (float *)output_grad.get_float_ptr(), + output_grad.get_float_ptr(), weight.get_float_ptr(), - (float *)weight_grad.get_float_ptr(), - (float *)bias_ptr, + weight_grad.get_float_ptr(), + bias_ptr, in_dim, out_dim, batch_size); diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..788ab52a7a 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,7 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()), From ddae36776dad2091f4b9aa79ef594c98258ed955 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Fri, 15 Nov 2024 17:09:37 -0800 Subject: [PATCH 18/42] code formatting and refactor --- lib/kernels/include/kernels/accessor.h | 103 +++++++++++++---- .../include/kernels/copy_tensor_accessor.h | 19 ++++ .../include/kernels/managed_ff_stream.h | 2 + .../kernels/managed_per_device_ff_handle.h | 2 + lib/kernels/src/accessor.cc | 104 +----------------- lib/kernels/src/copy_tensor_accessor.cc | 48 ++++++++ lib/kernels/src/cpu/replicate_kernels.cc | 4 +- lib/kernels/src/cpu/reverse_kernels.cc | 12 +- 
lib/kernels/src/cuda/ops/linear_kernels.cu | 42 +++---- lib/kernels/src/managed_ff_stream.cc | 9 +- .../src/managed_per_device_ff_handle.cc | 13 +-- .../test/src/test_managed_ff_stream.cc | 24 ++-- .../src/test_managed_per_device_ff_handle.cc | 26 +++-- .../test/src/test_local_cost_estimator.cc | 6 +- .../include/op-attrs/make_datatype_value.h | 16 +++ .../src/op-attrs/make_datatype_value.cc | 25 +++++ lib/pcg/src/pcg/computation_graph_builder.cc | 25 +++-- .../parallel_computation_graph_builder.cc | 9 +- 18 files changed, 285 insertions(+), 204 deletions(-) create mode 100644 lib/kernels/include/kernels/copy_tensor_accessor.h create mode 100644 lib/kernels/src/copy_tensor_accessor.cc create mode 100644 lib/op-attrs/include/op-attrs/make_datatype_value.h create mode 100644 lib/op-attrs/src/op-attrs/make_datatype_value.cc diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 653c8db42d..487bc1f8f0 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -11,8 +11,6 @@ namespace FlexFlow { -struct Allocator; - class GenericTensorAccessorR { public: template @@ -42,7 +40,7 @@ class GenericTensorAccessorR { bool operator!=(GenericTensorAccessorR const &) const; template - real_type_t
<DT> const &at(std::vector<size_t> const &indices) const { + real_type_t<DT>
const &at(std::vector<size_t> const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -50,11 +48,31 @@ class GenericTensorAccessorR { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t<DT>
; - T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } @@ -71,8 +89,6 @@ class GenericTensorAccessorR { decltype(ptr) const &, decltype(device_type) const &> tie() const; - - size_t calculate_index_offset(std::vector const &indices) const; }; std::string format_as(GenericTensorAccessorR const &); @@ -109,7 +125,7 @@ class GenericTensorAccessorW { operator GenericTensorAccessorR() const; template - real_type_t
<DT> &at(std::vector<size_t> const &indices) { + real_type_t<DT>
&at(std::vector<size_t> const &indices) { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -117,17 +133,37 @@ class GenericTensorAccessorW { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t<DT>
; T *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } template - real_type_t
<DT> &at(std::vector<size_t> const &indices) const { + real_type_t<DT>
&at(std::vector<size_t> const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -135,11 +171,31 @@ class GenericTensorAccessorW { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t<DT>
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } @@ -156,8 +212,6 @@ class GenericTensorAccessorW { decltype(ptr) const &, decltype(device_type) const &> tie() const; - - size_t calculate_index_offset(std::vector const &indices) const; }; std::string format_as(GenericTensorAccessorW const &); @@ -213,6 +267,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -239,14 +308,6 @@ std::pair void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor); -GenericTensorAccessorR - copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, - Allocator &allocator); - -GenericTensorAccessorW - copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, - Allocator &allocator); - } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..da8af71e4f --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..26d5fb4911 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -19,6 +19,8 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index f9f944c6ff..035ea574de 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -24,6 +24,8 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc 
index 4cb5bd83a2..e56bded737 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -26,7 +26,7 @@ void copy_accessor_data_to_l_from_r( dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); } else { assert(src_device_type == DeviceType::GPU); - assert(src_device_type == DeviceType::CPU); + assert(dst_device_type == DeviceType::GPU); checkCUDA(cudaMemcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes, @@ -53,36 +53,6 @@ std::tupledata_type, this->shape, this->ptr, this->device_type); } -size_t GenericTensorAccessorW::calculate_index_offset( - std::vector const &indices) const { - - if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error(fmt::format( - "Number of indices ({}) does not match the number of dimensions ({}).", - indices.size(), - this->shape.num_dims())); - } - - size_t offset = 0; - size_t multiplier = 1; - - for (size_t i = 0; i < this->shape.num_dims(); i++) { - if (indices[i] >= this->shape.at(legion_dim_t(i))) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices[i], - this->shape.at(legion_dim_t(i)))); - } - - offset += indices[i] * multiplier; - multiplier *= this->shape.at(legion_dim_t(i)); - } - - return offset; -} - bool GenericTensorAccessorW::operator==( GenericTensorAccessorW const &other) const { return this->tie() == other.tie(); @@ -139,36 +109,6 @@ std::tupledata_type, this->shape, this->ptr, this->device_type); } -size_t GenericTensorAccessorR::calculate_index_offset( - std::vector const &indices) const { - - if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error(fmt::format( - "Number of indices ({}) does not match the number of dimensions ({}).", - indices.size(), - this->shape.num_dims())); - } - - ssize_t offset = 0; - size_t multiplier = 1; - - for (size_t i = 0; i < this->shape.num_dims(); i++) { - if (indices[i] >= this->shape.at(legion_dim_t(i))) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices[i], - this->shape.at(legion_dim_t(i)))); - } - - offset += indices[i] * multiplier; - multiplier *= this->shape.at(legion_dim_t(i)); - } - - return offset; -} - bool GenericTensorAccessorR::operator==( GenericTensorAccessorR const &other) const { return this->tie() == other.tie(); @@ -280,46 +220,4 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -template -struct CopyTensorAccessorW { - GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, - Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - - copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); - - return dst_accessor; - } -}; - -GenericTensorAccessorW - copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, - Allocator &allocator) { - return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); -} - -template -struct CopyTensorAccessorR { - GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, - Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - - copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); - - return read_only_accessor_from_write_accessor(dst_accessor); - } -}; - 
-GenericTensorAccessorR - copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, - Allocator &allocator) { - return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); -} - } // namespace FlexFlow diff --git a/lib/kernels/src/copy_tensor_accessor.cc b/lib/kernels/src/copy_tensor_accessor.cc new file mode 100644 index 0000000000..6a3ad8033a --- /dev/null +++ b/lib/kernels/src/copy_tensor_accessor.cc @@ -0,0 +1,48 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 25693b374d..cfcb44dac5 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -19,9 +19,9 @@ struct CPUBackwardKernel { GenericTensorAccessorW &input, size_t num_replicas) { using T = real_type_t
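Note: copy_tensor_accessor_r/_w convert the accessor's runtime data_type into the compile-time parameter DT through DataTypeDispatch1. A reduced sketch of that dispatch idiom, with a three-value enum standing in for the real DataType and the machinery in kernels/datatype_dispatch.h (all names here are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>
    #include <utility>

    enum class DataType { FLOAT, DOUBLE, INT32 };

    template <DataType DT> struct real_type;
    template <> struct real_type<DataType::FLOAT> { using type = float; };
    template <> struct real_type<DataType::DOUBLE> { using type = double; };
    template <> struct real_type<DataType::INT32> { using type = int32_t; };

    // F is a class template over DataType, like CopyTensorAccessorW above.
    // Every specialization must agree on the call operator's return type,
    // which is why CopyTensorAccessorW always returns GenericTensorAccessorW.
    template <template <DataType> class F, typename... Args>
    auto dispatch(DataType dt, Args &&...args) {
      switch (dt) {
        case DataType::FLOAT:
          return F<DataType::FLOAT>{}(std::forward<Args>(args)...);
        case DataType::DOUBLE:
          return F<DataType::DOUBLE>{}(std::forward<Args>(args)...);
        case DataType::INT32:
          return F<DataType::INT32>{}(std::forward<Args>(args)...);
      }
      throw std::runtime_error("unhandled DataType");
    }

    template <DataType DT> struct SizeOfType {
      size_t operator()() const { return sizeof(typename real_type<DT>::type); }
    };

    // dispatch<SizeOfType>(DataType::DOUBLE) == 8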
; - for (size_t i = 0; i < input.shape.num_elements(); i++) { + for (int i = 0; i < input.shape.num_elements(); i++) { T cur_sum = 0; - for (size_t j = 0; j < num_replicas; j++) { + for (int j = 0; j < num_replicas; j++) { cur_sum += output.at
({i, j}); } input.at
({i}) = cur_sum; diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index e5b3719d74..bc73c80e9e 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,13 +11,13 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - size_t num_out_blocks = input.shape.at(legion_dim_t(0)); - size_t reverse_dim_size = input.shape.at(legion_dim_t(1)); - size_t in_block_size = input.shape.at(legion_dim_t(2)); + int num_out_blocks = input.shape.at(legion_dim_t(0)); + int reverse_dim_size = input.shape.at(legion_dim_t(1)); + int in_block_size = input.shape.at(legion_dim_t(2)); - for (size_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { - for (size_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { - for (size_t i = 0; i < in_block_size; i++) { + for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) { + for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { + for (int i = 0; i < in_block_size; i++) { output.at
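Note: since replicate's forward pass just copies the input, CPUBackwardKernel in cpu/replicate_kernels.cc sums the gradient over the replica axis. The same reduction written against a flat buffer, as a sketch (assumed layout: element index fastest, replica index slowest, matching output.at({i, j}) with legion dim 0 as the element axis):

    #include <cstddef>
    #include <vector>

    // input_grad[i] = sum over replicas j of output_grad[j * num_elements + i]
    void replicate_backward(std::vector<float> &input_grad,
                            std::vector<float> const &output_grad,
                            size_t num_replicas) {
      size_t num_elements = input_grad.size();
      for (size_t i = 0; i < num_elements; i++) {
        float cur_sum = 0.0f;
        for (size_t j = 0; j < num_replicas; j++) {
          cur_sum += output_grad[j * num_elements + i];
        }
        input_grad[i] = cur_sum;
      }
    }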
({block_idx, rev_idx, i}) = input.at
({num_out_blocks - 1 - block_idx, reverse_dim_size - 1 - rev_idx, diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index f13ebee67e..6b069218fa 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - reinterpret_cast(weight_ptr), + static_cast(weight_ptr), weight_type, in_dim, - reinterpret_cast(input_ptr), + static_cast(input_ptr), input_type, in_dim, &beta, - reinterpret_cast(output_ptr), + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - reinterpret_cast(bias_ptr), + static_cast(bias_ptr), weight_type, 1, - reinterpret_cast(m.one_ptr), + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - reinterpret_cast(output_ptr), + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - reinterpret_cast(output_ptr), + static_cast(output_ptr), &beta, m.outputTensor, - reinterpret_cast(output_ptr))); + static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -217,14 +217,14 @@ void backward_kernel(cudaStream_t stream, if (m.activation.has_value()) { if (m.activation == Activation::RELU) { relu_backward_kernel(m.output_type, - reinterpret_cast(output_grad_ptr), - reinterpret_cast(output_ptr), + static_cast(output_grad_ptr), + static_cast(output_ptr), output_size, stream); } else if (m.activation == Activation::SIGMOID) { sigmoid_backward_kernel(m.output_type, - reinterpret_cast(output_grad_ptr), - reinterpret_cast(output_ptr), + static_cast(output_grad_ptr), + static_cast(output_ptr), output_size, stream); } else { @@ -241,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - reinterpret_cast(input_ptr), + static_cast(input_ptr), input_type, in_dim, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(kernel_grad_ptr), + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -290,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - reinterpret_cast(m.one_ptr), + static_cast(m.one_ptr), CUDA_R_32F, 1, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(bias_grad_ptr), + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -313,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - reinterpret_cast(kernel_ptr), + static_cast(kernel_ptr), weight_type, in_dim, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(input_grad_ptr), + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index a8b44dc1d3..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -12,16 +12,17 @@ ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { if (this != &other) { - if (this->stream != nullptr) { - 
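Note: CPUReverseForwardKernel in cpu/reverse_kernels.cc mirrors both the block axis and the reverse axis while keeping the intra-block offset fixed. The same index mapping over flat buffers, as a sketch (strides assumed to follow the legion convention, dimension 0 fastest; hypothetical helper, not part of this patch):

    #include <cstddef>
    #include <vector>

    // out[b][r][i] = in[B - 1 - b][R - 1 - r][i]
    void reverse_forward(std::vector<float> const &in,
                         std::vector<float> &out,
                         size_t B,  // num_out_blocks
                         size_t R,  // reverse_dim_size
                         size_t I)  // in_block_size
    {
      auto idx = [&](size_t b, size_t r, size_t i) {
        return b + r * B + i * B * R;  // dim 0 (b) has stride 1
      };
      for (size_t b = 0; b < B; b++) {
        for (size_t r = 0; r < R; r++) {
          for (size_t i = 0; i < I; i++) {
            out[idx(b, r, i)] = in[idx(B - 1 - b, R - 1 - r, i)];
          }
        }
      }
    }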
checkCUDA(cudaStreamDestroy(*this->stream)); - delete stream; - } + this->cleanup(); this->stream = std::exchange(other.stream, nullptr); } return *this; } ManagedFFStream::~ManagedFFStream() { + this->cleanup(); +} + +void ManagedFFStream::cleanup() { if (this->stream != nullptr) { checkCUDA(cudaStreamDestroy(*this->stream)); delete this->stream; diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index 5bd49dc26f..9f1737240e 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -5,7 +5,7 @@ namespace FlexFlow { ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( size_t workSpaceSize, bool allowTensorOpMathConversion) { - this->handle = new PerDeviceFFHandle; + this->handle = new PerDeviceFFHandle{}; this->handle->workSpaceSize = workSpaceSize; this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; @@ -21,18 +21,17 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { if (this != &other) { - if (this->handle != nullptr) { - checkCUDNN(cudnnDestroy(this->handle->dnn)); - checkCUBLAS(cublasDestroy(this->handle->blas)); - checkCUDA(cudaFree(this->handle->workSpace)); - delete this->handle; - } + this->cleanup(); this->handle = std::exchange(other.handle, nullptr); } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { + this->cleanup(); +} + +void ManagedPerDeviceFFHandle::cleanup() { if (this->handle != nullptr) { checkCUDNN(cudnnDestroy(this->handle->dnn)); checkCUBLAS(cublasDestroy(this->handle->blas)); diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index ce8a808454..605aa6ffa1 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -4,26 +4,28 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Managed FF Stream") { + TEST_CASE("ManagedFFStream") { ManagedFFStream base_stream{}; ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - SUBCASE("Test ManagedFFStream Move Constructor") { + SUBCASE("move constructor") { ManagedFFStream new_stream(std::move(base_stream)); CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } - SUBCASE("Test ManagedFFStream Assignment Operator") { - ManagedFFStream new_stream{}; - new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); - CHECK(&new_stream.raw_stream() == base_stream_ptr); - } + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedFFStream new_stream{}; + new_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } - SUBCASE("Test Self-Assignment") { - base_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == base_stream_ptr); + SUBCASE("move assign to self") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d39da03ba9..de3e5b72b1 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -4,33 +4,35 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - 
TEST_CASE("Test Managed Per Device FF Handle") { + TEST_CASE("ManagedPerDeviceFFHandle") { ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { + SUBCASE("constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); } - SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { + SUBCASE("move constructor") { ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } - SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { - ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; - new_handle = std::move(base_handle); + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; + new_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == nullptr); - CHECK(&new_handle.raw_handle() == base_handle_ptr); - } + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } - SUBCASE("Test Self-Assignment") { - base_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == base_handle_ptr); + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } } diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 788ab52a7a..512c1ef33b 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,11 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); +// ManagedPerDeviceFFHandle managed_handle{ +/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true +} +; // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h new file mode 100644 index 0000000000..c3289c6309 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/make_datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/make_datatype_value.cc new file mode 100644 index 0000000000..bc402c433c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/make_datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/make_datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue make_double_data_type_value(double value) { + return 
DataTypeValue{value}; +} + +DataTypeValue make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 2d523c78ac..7ff5bec2f7 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -3,6 +3,7 @@ #include "op-attrs/get_incoming_tensor_roles.h" #include "op-attrs/get_op_type.h" #include "op-attrs/get_output_shapes.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_norm.h" #include "op-attrs/ops/broadcast.h" @@ -613,14 +614,14 @@ tensor_guid_t ComputationGraphBuilder::batch_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } @@ -692,8 +693,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_input_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs input_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs input_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(input_bias_shape, input_bias_initializer)); @@ -702,8 +703,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_output_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs output_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs output_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(output_bias_shape, output_bias_initializer)); @@ -888,14 +889,14 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - 
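Note: the make_*_data_type_value factories pin down which alternative of the variant gets constructed. Brace-initializing from a bare literal resolves the alternative by overload resolution, so DataTypeValue{1} would hold an int32_t rather than a float; the factory's parameter type removes that ambiguity at every call site, as in the gamma/beta initializers above. A small sketch of the hazard, assuming DataTypeValue behaves like a std::variant over these five types:

    #include <cstdint>
    #include <variant>

    using Value = std::variant<float, double, int32_t, int64_t, bool>;

    Value make_float_value(float value) {
      return Value{value};  // alternative fixed by the parameter type
    }

    // Value{1} would hold int32_t; the factory yields the float alternative:
    Value one = make_float_value(1);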
InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index f33b4dcd17..79ac43ae66 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -1,5 +1,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "op-attrs/get_incoming_tensor_roles.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/ops/batch_norm.h" @@ -385,14 +386,14 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_norm( ParallelTensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); ParallelTensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } From 507df4a30d5b20eef90fc2f8ed75e2707a0e2b6c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 21 Nov 2024 22:16:51 -0800 Subject: [PATCH 19/42] issue #1502 & issue #1540 --- lib/kernels/CMakeLists.txt | 3 +- .../include/kernels/batch_norm_kernels.h | 4 +- lib/kernels/include/kernels/cast_kernels.h | 4 +- .../include/kernels/cast_kernels_cpu.h | 4 +- lib/kernels/include/kernels/conv_2d_kernels.h | 4 +- .../include/kernels/element_unary_kernels.h | 6 +- .../include/kernels/embedding_kernels.h | 4 +- lib/kernels/include/kernels/flat_kernels.h | 7 +- lib/kernels/include/kernels/linear_kernels.h | 4 +- .../include/kernels/loss_function_kernels.h | 2 +- lib/kernels/include/kernels/metrics_kernels.h | 29 +- .../include/kernels/optimizer_kernels.h | 124 ++-- .../include/kernels/partition_kernels.h | 4 +- .../kernels}/per_device_op_state.variant.toml | 0 lib/kernels/include/kernels/pool_2d_kernels.h | 9 +- .../include/kernels/reduction_kernels.h | 4 +- lib/kernels/include/kernels/reshape_kernels.h | 4 +- lib/kernels/include/kernels/softmax_kernels.h | 2 +- .../include/kernels/transpose_kernels.h | 4 +- lib/kernels/src/cpu/cast_kernels.cc | 14 +- lib/kernels/src/cuda/cuda_helper.cu | 12 +- lib/kernels/src/cuda/embedding_kernels.cu | 549 ++++++++++++++---- lib/kernels/src/cuda/metrics_functions.cu | 101 ++-- .../src/cuda/ops/batch_norm_kernels.cu | 4 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 14 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 4 +- .../src/cuda/ops/element_unary_kernels.cu | 18 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 4 +- lib/kernels/src/cuda/ops/linear_kernels.cu | 4 +- lib/kernels/src/cuda/ops/partition_kernels.cu | 10 +- lib/kernels/src/cuda/ops/pool_2d_kernels.cu | 6 +- 
lib/kernels/src/cuda/ops/reduction_kernels.cu | 10 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 10 +- lib/kernels/src/cuda/ops/softmax_kernels.cu | 2 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 4 +- ...timizer_kernel.cu => optimizer_kernels.cu} | 57 +- .../local-execution/per_device_op_state.h | 2 +- .../local-execution/task_argument_accessor.h | 2 +- lib/local-execution/src/ops/batch_norm.cc | 4 +- lib/local-execution/src/ops/conv_2d.cc | 6 +- lib/local-execution/src/ops/element_unary.cc | 10 +- lib/local-execution/src/ops/flat.cc | 6 +- lib/local-execution/src/ops/linear.cc | 4 +- lib/local-execution/src/ops/pool_2d.cc | 10 +- lib/local-execution/src/ops/reduction.cc | 6 +- lib/local-execution/src/ops/repartition.cc | 4 +- lib/local-execution/src/ops/reshape.cc | 4 +- lib/local-execution/src/ops/softmax.cc | 2 +- lib/local-execution/src/ops/transpose.cc | 4 +- ...device_state.cc => per_device_op_state.cc} | 0 .../include/op-attrs/aggregate_op.enum.toml | 5 +- .../include/op-attrs/datatype_value.h | 16 + .../include/op-attrs/make_datatype_value.h | 2 +- .../src/op-attrs/make_datatype_value.cc | 10 +- lib/pcg/include/pcg/metric.h | 73 +++ lib/pcg/src/pcg/metric.cc | 38 ++ lib/runtime/src/metrics_functions.cc | 33 -- lib/runtime/src/metrics_functions.h | 63 +- lib/runtime/src/ops/embedding.cc | 4 +- 59 files changed, 917 insertions(+), 436 deletions(-) rename lib/{local-execution/include/local-execution => kernels/include/kernels}/per_device_op_state.variant.toml (100%) rename lib/kernels/src/cuda/{optimizer_kernel.cu => optimizer_kernels.cu} (80%) rename lib/local-execution/src/{per_device_state.cc => per_device_op_state.cc} (100%) create mode 100644 lib/op-attrs/include/op-attrs/datatype_value.h create mode 100644 lib/pcg/include/pcg/metric.h create mode 100644 lib/pcg/src/pcg/metric.cc diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index fc91b7d3db..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 4de6ac6af0..3fea92c86b 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -63,9 +63,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 21e76fed1d..da13e0036d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -11,8 +11,8 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 275476b4e6..a5df80d4da 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ 
b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -9,8 +9,8 @@ namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index 217751e191..f49c8f50f4 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -60,10 +60,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 26ce4ecaec..c338f465ac 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -36,10 +36,10 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); } // namespace Kernels::ElementUnary } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 6d5141f489..f5b2561b56 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -17,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 41b411c937..d60a1a5157 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -9,10 +9,11 @@ namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); -void backward_kernel(ffStream_t stream, + +void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); } // namespace FlexFlow::Kernels::Flat diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index cff6563629..cd581b0a25 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -60,10 +60,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - float const 
*input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *kernel_ptr, float *kernel_grad_ptr, float *bias_ptr, diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..9e0dbd4ba1 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..d961ee7503 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..3b5d292a5f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -2,53 +2,91 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #include "device.h" +#include "kernels/ff_handle.h" +#include "kernels/nccl.h" +#include "kernels/per_device_op_state.dtg.h" namespace FlexFlow { -void sgd_ps_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W); + +class SGDOptimizer { +public: + static __host__ void ps_update_task_gpu(SGDOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr); + +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(SGDOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr); +#endif + +public: + float lr; + float weight_decay; + float momentum; + bool nesterov; +}; + +__global__ void + add_kernel(int count, float scale, float const *src, float *dst); + +__global__ void scale_kernel(int count, float a, float b, float *ptr); + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, float weight_decay, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float 
*sgd_v_ptr); - -void sgd_nccl_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *sgd_v_ptr); - -void adam_ps_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); - -void adam_nccl_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W); -} // namespace FlexFlow +class AdamOptimizer { +public: + static __host__ void ps_update_task_gpu(AdamOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr); +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(AdamOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr); #endif + +public: + float alpha; + float alpha_t; + float beta1; + float beta2; + float weight_decay; + float epsilon; +}; + +} // namespace FlexFlow + +#endif // _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index e580c4a9de..9a303952d0 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -25,8 +25,8 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); } // namespace Kernels::Repartition } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 191c23bc98..c0e57e2c9a 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,12 +67,13 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + } // namespace Kernels::Pool2D } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index 7e1e240ea4..12553edd5e 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -12,8 +12,8 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - 
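Note: the sgd_update kernel declared above folds weight decay, momentum, and optional Nesterov into a single elementwise pass. A sketch of a conventional body for that signature (assumed semantics, following the standard SGD-with-momentum formulation; V is the momentum buffer, W the weights):

    __global__ void sgd_update_sketch(size_t count,
                                      float lr,
                                      float weight_decay,
                                      float momentum,
                                      bool nesterov,
                                      float const *WGrad,
                                      float *V,
                                      float *W) {
      CUDA_KERNEL_LOOP(i, count) {
        float gt = WGrad[i] + weight_decay * W[i];  // L2 decay folded into grad
        float mt = momentum * V[i] + gt;            // momentum accumulation
        V[i] = mt;
        // Nesterov looks ahead by applying the fresh gradient on top of the
        // momentum step; plain momentum just follows the buffer.
        W[i] -= lr * (nesterov ? gt + momentum * mt : mt);
      }
    }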
GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Reduction diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 5fa4382c43..6e19a9d251 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -24,8 +24,8 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace Kernels::Reshape } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 93135cb648..520ea61b64 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -30,8 +30,8 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); } // namespace Kernels::Softmax diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index b48b7e0aa8..dbf78826cb 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -28,8 +28,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); } // namespace Kernels::Transpose } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 5a00503fe4..08f5552afc 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -28,11 +28,11 @@ struct CPUForwardKernel { template struct CPUBackwardKernel { - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); cpu_cast_backward( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; @@ -42,10 +42,10 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input, input.data_type, output.data_type, input, output); } -void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input.data_type, output.data_type, input, output); + output.data_type, input.data_type, output, input); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 2ff02038f4..b30cf6a663 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void 
scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template __global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..c83e9f0a94 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -17,12 +17,11 @@ #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the input tensor to avoid out-of-range index issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the input tensor to avoid out-of-range index issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const 
*input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,219 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - 
assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + 
void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -388,39 +547,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> + embed_backward_no_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> + embed_backward_with_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + 
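Note: each ForwardKernel/BackwardKernel specialization in this hunk instantiates the same kernel shape for one (index type, data type) pair; only the template arguments change. The aggregated-forward logic itself, written once as a generic sketch (assumed layout: each of the batch_size rows of input holds in_dim token indices into the embedding table; AVG scales the accumulated sum by 1/in_dim):

    template <typename TI, typename TD>
    __global__ void embed_forward_with_aggr_sketch(TI const *input,
                                                   TD *output,
                                                   TD const *embed,
                                                   int out_dim,
                                                   int in_dim,
                                                   int batch_size,
                                                   AggregateOp aggr) {
      TD scale = static_cast<TD>(1.0f / in_dim);
      CUDA_KERNEL_LOOP(i, batch_size * out_dim) {
        int idx = i / out_dim;  // which batch row
        int off = i % out_dim;  // which output feature
        TD acc = static_cast<TD>(0.0f);
        for (int j = 0; j < in_dim; j++) {
          TI wordIdx = input[idx * in_dim + j];
          acc += embed[wordIdx * out_dim + off];  // gather one embedding row
        }
        output[i] = (aggr == AggregateOp::AVG) ? acc * scale : acc;
      }
    }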
embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -448,27 +797,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..2901f1d374 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. 
*/ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,36 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, *me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics 
perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + update_metrics_label_kernel<<>>( + logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 6c6e17a181..512981e32b 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index dc342fd0e0..afc3e1f7ef 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -50,11 +50,11 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; @@ -66,10 +66,10 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input.data_type, output.data_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index e3a4c97a31..0a4024ba8a 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index a35d28fa8c..687a9fa220 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW 
const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 941db108a0..f661e5fb0a 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -34,8 +34,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 6b069218fa..0d5a772918 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -191,10 +191,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *kernel_ptr, float *kernel_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 1d07efb5fa..3687c1cedf 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -39,8 +39,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel><<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..f8b35ec885 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 
0c6ba7d8e3..9c3e8dcc40 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -54,8 +54,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements() * size_of_datatype(T), @@ -73,9 +73,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 5b7843a3a5..b7a328ca08 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -42,8 +42,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/softmax_kernels.cu b/lib/kernels/src/cuda/ops/softmax_kernels.cu index 93ed85de18..d2498d08a4 100644 --- a/lib/kernels/src/cuda/ops/softmax_kernels.cu +++ b/lib/kernels/src/cuda/ops/softmax_kernels.cu @@ -61,8 +61,8 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 3b3f80944d..37e1a08326 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -91,8 +91,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad) { + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { TransposeStrides info; info.num_dim = in_grad.shape.num_dims(); diff --git a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 80% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..237a277b21 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,7 +13,9 @@ * limitations under the License. 
*/ +#include "device.h" #include "kernels/optimizer_kernels.h" +#include "utils/exception.h" namespace FlexFlow { @@ -80,13 +82,28 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + const auto& state = meta->raw_variant; + ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, state); + checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + comm, + stream)); + // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -157,7 +174,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, for (int i = 1; i < num_replicas; i++) { float const *src = w_grad_ptr + i * size; add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); + (float *)w_grad_ptr, src, size); } // checkCUDA(cudaDeviceSynchronize()); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", @@ -188,13 +205,27 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + const auto& state = meta->raw_variant; + ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, state); + checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + comm, + stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" 
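Note on the optimizer kernel changes above: both the SGD and Adam NCCL update paths now obtain the communicator by visiting a variant of per-device states and rejecting state types that carry no NCCL handle. A minimal self-contained sketch of that pattern, assuming C++17; StateWithComm, StateWithoutComm, and the ncclComm_t alias below are invented stand-ins, not the real FlexFlow or NCCL types:

#include <stdexcept>
#include <type_traits>
#include <variant>

using ncclComm_t = void *; // stand-in for the opaque NCCL handle type

struct StateWithComm {
  struct { ncclComm_t ncclComm; } handle;
};
struct StateWithoutComm {};

using PerDeviceState = std::variant<StateWithComm, StateWithoutComm>;

ncclComm_t get_nccl_comm(PerDeviceState const &state) {
  return std::visit(
      [](auto const &s) -> ncclComm_t {
        using T = std::decay_t<decltype(s)>;
        if constexpr (std::is_same_v<T, StateWithoutComm>) {
          // Mirrors the mk_runtime_error thrown in the patch.
          throw std::runtime_error(
              "State type does not support NCCL operations");
        } else {
          return s.handle.ncclComm;
        }
      },
      state);
}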
-#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..3aed3111c7 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -133,9 +133,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..d7c5c22170 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -108,8 +108,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -121,10 +121,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 4ee609bd6c..10f1dce294 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -89,10 +89,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -107,10 +107,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 3fe5029fa1..8d998a8672 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -41,15 +41,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index fd2c1cd5e4..1eb0360db4 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -148,10 +148,10 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", 
per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), weight.get_float_ptr(), weight_grad.get_float_ptr(), bias_ptr, diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 3ab33a2ad6..a1167a731c 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -125,19 +125,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index a58d79a4f8..1e85d7186e 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -64,13 +64,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 73692f4a13..655e1f238b 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -86,8 +86,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 7584d405eb..761718a9a7 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -87,8 +87,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 8d412c739b..71a6ce435e 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -107,8 +107,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 53cf1f20ed..30310d3349 100644 --- 
a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -98,8 +98,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..2c524c120a 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,8 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + +[[values]] +name = "NONE" diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h index c3289c6309..af4792dd9e 100644 --- a/lib/op-attrs/include/op-attrs/make_datatype_value.h +++ b/lib/op-attrs/include/op-attrs/make_datatype_value.h @@ -11,6 +11,6 @@ DataTypeValue make_int32_data_type_value(int32_t value); DataTypeValue make_int64_data_type_value(int64_t value); DataTypeValue make_bool_data_type_value(bool value); -} +} // namespace FlexFlow #endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/make_datatype_value.cc index bc402c433c..76d712949a 100644 --- a/lib/op-attrs/src/op-attrs/make_datatype_value.cc +++ b/lib/op-attrs/src/op-attrs/make_datatype_value.cc @@ -11,15 +11,15 @@ DataTypeValue make_double_data_type_value(double value) { } DataTypeValue make_int32_data_type_value(int32_t value) { - return DataTypeValue{value}; + return DataTypeValue{value}; } DataTypeValue make_int64_data_type_value(int64_t value) { - return DataTypeValue{value}; + return DataTypeValue{value}; } DataTypeValue make_bool_data_type_value(bool value) { - return DataTypeValue{value}; -} - + return DataTypeValue{value}; } + +} // namespace FlexFlow diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h new file mode 100644 index 0000000000..f56078772e --- /dev/null +++ b/lib/pcg/include/pcg/metric.h @@ -0,0 +1,73 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include +#include "utils/fmt.h" +#include "op-attrs/ops/loss_functions/loss_functions.h" + +namespace FlexFlow { + +enum class Metric { + ACCURACY, + CATEGORICAL_CROSSENTROPY, + SPARSE_CATEGORICAL_CROSSENTROPY, + MEAN_SQUARED_ERROR, + 
ROOT_MEAN_SQUARED_ERROR,
+  MEAN_ABSOLUTE_ERROR,
+};
+
+class MetricsAttrs {
+public:
+  MetricsAttrs() = delete;
+  MetricsAttrs(LossFunction, std::vector<Metric> const &);
+
+public:
+  LossFunction loss_type;
+  bool measure_accuracy;
+  bool measure_categorical_crossentropy;
+  bool measure_sparse_categorical_crossentropy;
+  bool measure_mean_squared_error;
+  bool measure_root_mean_squared_error;
+  bool measure_mean_absolute_error;
+};
+
+} // namespace FlexFlow
+
+namespace fmt {
+
+template <>
+struct formatter<::FlexFlow::Metric> : formatter<string_view> {
+  template <typename FormatContext>
+  auto format(::FlexFlow::Metric m, FormatContext &ctx) const
+      -> decltype(ctx.out()) {
+    using namespace FlexFlow;
+
+    string_view name = "unknown";
+    switch (m) {
+      case Metric::ACCURACY:
+        name = "Accuracy";
+        break;
+      case Metric::CATEGORICAL_CROSSENTROPY:
+        name = "CategoricalCrossEntropy";
+        break;
+      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+        name = "SparseCategoricalCrossEntropy";
+        break;
+      case Metric::MEAN_SQUARED_ERROR:
+        name = "MeanSquaredError";
+        break;
+      case Metric::ROOT_MEAN_SQUARED_ERROR:
+        name = "RootMeanSquaredError";
+        break;
+      case Metric::MEAN_ABSOLUTE_ERROR:
+        name = "MeanAbsoluteError";
+        break;
+    }
+    return formatter<string_view>::format(name, ctx);
+  }
+};
+
+} // namespace fmt
+
+
+#endif
diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc
new file mode 100644
index 0000000000..eb0d6bc5d0
--- /dev/null
+++ b/lib/pcg/src/pcg/metric.cc
@@ -0,0 +1,38 @@
+#include "pcg/metric.h"
+
+namespace FlexFlow {
+MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
+                           std::vector<Metric> const &metrics)
+    : loss_type(_loss_type), measure_accuracy(false),
+      measure_categorical_crossentropy(false),
+      measure_sparse_categorical_crossentropy(false),
+      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
+      measure_mean_absolute_error(false) {
+for (Metric const &m : metrics) {
+  switch (m) {
+    case Metric::ACCURACY:
+      measure_accuracy = true;
+      continue;
+    case Metric::CATEGORICAL_CROSSENTROPY:
+      measure_categorical_crossentropy = true;
+      continue;
+    case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+      measure_sparse_categorical_crossentropy = true;
+      continue;
+    case Metric::MEAN_SQUARED_ERROR:
+      measure_mean_squared_error = true;
+      continue;
+    case Metric::ROOT_MEAN_SQUARED_ERROR:
+      measure_root_mean_squared_error = true;
+      continue;
+    case Metric::MEAN_ABSOLUTE_ERROR:
+      measure_mean_absolute_error = true;
+      continue;
+    default:
+      throw mk_runtime_error("Initializing MetricsAttrs with unrecognized metrics type");
+  }
+}
+}
+
+
+}
diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc
index feb6e704b2..33e15baed2 100644
--- a/lib/runtime/src/metrics_functions.cc
+++ b/lib/runtime/src/metrics_functions.cc
@@ -25,39 +25,6 @@ namespace FlexFlow {
 
 LegionRuntime::Logger::Category log_metrics("metrics");
 
-MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
-    : loss_type(_loss_type), measure_accuracy(false),
-      measure_categorical_crossentropy(false),
-      measure_sparse_categorical_crossentropy(false),
-      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
-      measure_mean_absolute_error(false) {
-  for (Metric const &m : metrics) {
-    switch (m) {
-      case Metric::ACCURACY:
-        measure_accuracy = true;
-        continue;
-      case Metric::CATEGORICAL_CROSSENTROPY:
-        measure_categorical_crossentropy = true;
-        continue;
-      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-        measure_sparse_categorical_crossentropy = true;
-        continue;
-      case Metric::MEAN_SQUARED_ERROR:
-
measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include "task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 253fd3cb4f..83e7c15460 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), From c64a55c3cfbad062d3fa6fd6b705c4cdb7509fac Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 21 Nov 2024 22:46:25 -0800 Subject: [PATCH 20/42] format check --- lib/kernels/include/kernels/pool_2d_kernels.h | 1 - lib/kernels/src/cuda/metrics_functions.cu | 5 +- lib/kernels/src/cuda/optimizer_kernels.cu | 76 +++++++++---------- lib/pcg/include/pcg/metric.h | 5 +- lib/pcg/src/pcg/metric.cc | 62 +++++++-------- 5 files changed, 69 insertions(+), 80 deletions(-) diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index c0e57e2c9a..ad0a52efb9 100644 --- 
a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -74,7 +74,6 @@ void backward_kernel(cudaStream_t stream, void const *input_ptr, void *input_grad_ptr); - } // namespace Kernels::Pool2D } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2901f1d374..0250f829ec 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -200,10 +200,7 @@ void update_metrics_label_kernel_wrapper(float const *logit_ptr, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - update_metrics_label_kernel<<>>( + update_metrics_label_kernel<<>>( logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); checkCUDA(cudaMemcpy( diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 237a277b21..1c6954a0b0 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -83,26 +83,23 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - const auto& state = meta->raw_variant; - ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { - using T = std::decay_t; - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) { - throw mk_runtime_error("State type does not support NCCL operations"); - } else { - return s.handle.ncclComm; - } - }, state); - - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - comm, - stream)); + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -205,27 +202,24 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - - const auto& state = meta->raw_variant; - ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { - using T = std::decay_t; - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) { - throw mk_runtime_error("State type does not support NCCL operations"); - } else { - return s.handle.ncclComm; - } - }, state); - - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - comm, - stream)); + + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam 
update
diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h
index f56078772e..718919112f 100644
--- a/lib/pcg/include/pcg/metric.h
+++ b/lib/pcg/include/pcg/metric.h
@@ -1,9 +1,9 @@
 #ifndef _FF_METRICS_H_
 #define _FF_METRICS_H_
 
-#include <vector>
-#include "utils/fmt.h"
+#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "utils/fmt.h"
+#include <vector>
 
 namespace FlexFlow {
 
@@ -69,5 +69,4 @@ struct formatter<::FlexFlow::Metric> : formatter<string_view> {
 
 } // namespace fmt
 
-
 #endif
diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc
index eb0d6bc5d0..69aba90d12 100644
--- a/lib/pcg/src/pcg/metric.cc
+++ b/lib/pcg/src/pcg/metric.cc
@@ -2,37 +2,37 @@
 namespace FlexFlow {
 MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
-    : loss_type(_loss_type), measure_accuracy(false),
-      measure_categorical_crossentropy(false),
-      measure_sparse_categorical_crossentropy(false),
-      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
-      measure_mean_absolute_error(false) {
-for (Metric const &m : metrics) {
-  switch (m) {
-    case Metric::ACCURACY:
-      measure_accuracy = true;
-      continue;
-    case Metric::CATEGORICAL_CROSSENTROPY:
-      measure_categorical_crossentropy = true;
-      continue;
-    case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-      measure_sparse_categorical_crossentropy = true;
-      continue;
-    case Metric::MEAN_SQUARED_ERROR:
-      measure_mean_squared_error = true;
-      continue;
-    case Metric::ROOT_MEAN_SQUARED_ERROR:
-      measure_root_mean_squared_error = true;
-      continue;
-    case Metric::MEAN_ABSOLUTE_ERROR:
-      measure_mean_absolute_error = true;
-      continue;
-    default:
-      throw mk_runtime_error("Initializing MetricsAttrs with unrecognized metrics type");
+                           std::vector<Metric> const &metrics)
+    : loss_type(_loss_type), measure_accuracy(false),
+      measure_categorical_crossentropy(false),
+      measure_sparse_categorical_crossentropy(false),
+      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
+      measure_mean_absolute_error(false) {
+  for (Metric const &m : metrics) {
+    switch (m) {
+      case Metric::ACCURACY:
+        measure_accuracy = true;
+        continue;
+      case Metric::CATEGORICAL_CROSSENTROPY:
+        measure_categorical_crossentropy = true;
+        continue;
+      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+        measure_sparse_categorical_crossentropy = true;
+        continue;
+      case Metric::MEAN_SQUARED_ERROR:
+        measure_mean_squared_error = true;
+        continue;
+      case Metric::ROOT_MEAN_SQUARED_ERROR:
+        measure_root_mean_squared_error = true;
+        continue;
+      case Metric::MEAN_ABSOLUTE_ERROR:
+        measure_mean_absolute_error = true;
+        continue;
+      default:
+        throw mk_runtime_error(
+            "Initializing MetricsAttrs with unrecognized metrics type");
+      }
+    }
+  }
 }
-}
-
-}
+} // namespace FlexFlow

From a091652370cb2a2c29d60100253fd6fba2882307 Mon Sep 17 00:00:00 2001
From: Dylan Lim
Date: Mon, 27 Jan 2025 20:57:10 -0800
Subject: [PATCH 21/42] branch merge and test fixes

---
 lib/kernels/include/kernels/accessor.h        | 76 ++++--------
 lib/kernels/include/kernels/flat_kernels.h    |  2 +-
 .../include/kernels/loss_function_kernels.h   |  2 +-
 .../include/kernels/managed_ff_stream.h       |  1 +
 .../kernels/managed_per_device_ff_handle.h    |  1 +
 lib/kernels/include/kernels/metrics_kernels.h |  6 +-
 lib/kernels/include/kernels/pool_2d_kernels.h |  2 +-
 lib/kernels/src/cuda/metrics_functions.cu     | 10 +-
 lib/kernels/src/hip/embedding_kernels.cpp     | 30 ++---
 .../test/src/test_batch_norm_kernel.cc        |  2 +-
 lib/kernels/test/src/test_concat_kernel.cc    |  2 +-
 lib/kernels/test/src/test_flat_kernel.cc      |  2 +-
.../test/src/test_layer_norm_kernels.cc | 2 +- .../src/test_managed_per_device_ff_handle.cc | 7 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 2 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- lib/kernels/test/src/test_utils.cc | 18 ++- lib/kernels/test/src/test_utils.h | 2 +- .../test/src/test_local_cost_estimator.cc | 115 +++++++++--------- .../include/op-attrs/aggregate_op.enum.toml | 2 - .../include/op-attrs/make_datatype_value.h | 16 --- ...ke_datatype_value.cc => datatype_value.cc} | 2 +- .../test/src/op-attrs/datatype_value.cc | 68 +++++++++++ lib/pcg/include/pcg/metric.enum.toml | 26 ++++ lib/pcg/include/pcg/metric.h | 72 ----------- lib/pcg/include/pcg/metric_attrs.h | 28 +++++ lib/pcg/include/pcg/strided_rectangle.h | 17 --- lib/pcg/src/pcg/computation_graph_builder.cc | 2 +- lib/pcg/src/pcg/metric.cc | 8 +- .../parallel_computation_graph_builder.cc | 2 +- lib/pcg/src/pcg/strided_rectangle_side.cc | 17 --- lib/pcg/src/strided_rectangle.cc | 35 ------ lib/pcg/test/src/test_machine_view.cc | 74 ----------- lib/pcg/test/src/test_strided_rectangle.cc | 37 ------ 38 files changed, 263 insertions(+), 436 deletions(-) delete mode 100644 lib/op-attrs/include/op-attrs/make_datatype_value.h rename lib/op-attrs/src/op-attrs/{make_datatype_value.cc => datatype_value.cc} (92%) create mode 100644 lib/op-attrs/test/src/op-attrs/datatype_value.cc create mode 100644 lib/pcg/include/pcg/metric.enum.toml delete mode 100644 lib/pcg/include/pcg/metric.h create mode 100644 lib/pcg/include/pcg/metric_attrs.h delete mode 100644 lib/pcg/include/pcg/strided_rectangle.h delete mode 100644 lib/pcg/src/pcg/strided_rectangle_side.cc delete mode 100644 lib/pcg/src/strided_rectangle.cc delete mode 100644 lib/pcg/test/src/test_machine_view.cc delete mode 100644 lib/pcg/test/src/test_strided_rectangle.cc diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 487bc1f8f0..a6fc4129e0 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -11,6 +11,28 @@ namespace FlexFlow { +inline int calculate_accessor_offset(std::vector const &indices, + ArrayShape const &shape) { + int offset = 0; + int multiplier = 1; + + for (int i = 0; i < shape.num_dims(); i++) { + if (indices.at(i) >= shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= shape.at(legion_dim_t{i}); + } + + return offset; +} + class GenericTensorAccessorR { public: template @@ -57,23 +79,7 @@ class GenericTensorAccessorR { using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - - int offset = 0; - int multiplier = 1; - for (int i = 0; i < this->shape.num_dims(); i++) { - if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices.at(i), - this->shape.at(legion_dim_t{i}))); - } - - offset += indices.at(i) * multiplier; - multiplier *= this->shape.at(legion_dim_t{i}); - } - + int offset = calculate_accessor_offset(indices, this->shape); return data_ptr[offset]; } @@ -141,24 +147,8 @@ class GenericTensorAccessorW { } using T = real_type_t
; - T *data_ptr = static_cast(this->ptr); - int offset = 0; - int multiplier = 1; - for (int i = 0; i < this->shape.num_dims(); i++) { - if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices.at(i), - this->shape.at(legion_dim_t{i}))); - } - - offset += indices.at(i) * multiplier; - multiplier *= this->shape.at(legion_dim_t{i}); - } - + int offset = calculate_accessor_offset(indices, this->shape); return data_ptr[offset]; } @@ -179,24 +169,8 @@ class GenericTensorAccessorW { } using T = real_type_t
; - T const *data_ptr = static_cast(this->ptr); - int offset = 0; - int multiplier = 1; - for (int i = 0; i < this->shape.num_dims(); i++) { - if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices.at(i), - this->shape.at(legion_dim_t{i}))); - } - - offset += indices.at(i) * multiplier; - multiplier *= this->shape.at(legion_dim_t{i}); - } - + int offset = calculate_accessor_offset(indices, this->shape); return data_ptr[offset]; } diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index d60a1a5157..54839bd7fa 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -10,7 +10,7 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); -void backward_kernel(cudaStream_t stream, +void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, float const *output_grad_ptr, float *input_grad_ptr); diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index 9e0dbd4ba1..bab404f884 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 26d5fb4911..7f103ea560 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -19,6 +19,7 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: void cleanup(); private: diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 035ea574de..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -24,6 +24,7 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: void cleanup(); private: diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index d961ee7503..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -2,20 +2,20 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #include "kernels/perf_metrics.h" -#include "pcg/metric.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, int const *label_ptr, - MetricsAttrs const *me, + MetricsAttrs const &me, int num_effective_samples, int num_classes, PerfMetrics &perf_zc); void update_metrics_label_kernel_wrapper(float const *logit_ptr, float const *label_ptr, - MetricsAttrs const *me, + MetricsAttrs const &me, int num_samples, int num_classes, PerfMetrics &perf_zc); diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index ad0a52efb9..9650859a18 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,7 +67,7 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void 
*output_ptr); -void backward_kernel(cudaStream_t stream, +void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, void const *output_ptr, void const *output_grad_ptr, diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 0250f829ec..112f84c90c 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -16,7 +16,7 @@ #include "device.h" #include "kernels/metrics_kernels.h" #include "kernels/perf_metrics.h" -#include "pcg/metric.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { @@ -163,7 +163,7 @@ __global__ void update_metrics_label_kernel(float const *logits, void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, int const *label_ptr, - MetricsAttrs const *me, + MetricsAttrs const &me, int num_effective_samples, int num_classes, PerfMetrics &perf_zc) { @@ -179,7 +179,7 @@ void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf_cuda, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); checkCUDA(cudaMemcpy( &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); @@ -188,7 +188,7 @@ void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, void update_metrics_label_kernel_wrapper(float const *logit_ptr, float const *label_ptr, - MetricsAttrs const *me, + MetricsAttrs const &me, int num_samples, int num_classes, PerfMetrics &perf_zc) { @@ -201,7 +201,7 @@ void update_metrics_label_kernel_wrapper(float const *logit_ptr, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); checkCUDA(cudaMemcpy( &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp index 7ca3149f2f..06b42d420a 100644 --- a/lib/kernels/src/hip/embedding_kernels.cpp +++ b/lib/kernels/src/hip/embedding_kernels.cpp @@ -364,8 +364,8 @@ struct ForwardKernel { weight.data_type == DataType::FLOAT || weight.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -374,10 +374,11 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -386,9 +387,7 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } @@ -408,8 +407,9 @@ struct BackwardKernel { assert(output.data_type == DataType::HALF || output.data_type == DataType::FLOAT || output.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), + + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + 
hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -418,9 +418,11 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -429,9 +431,7 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 03a3a1ad40..270fad7bb6 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/batch_norm_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 4607171a54..5447b12fc5 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { size_t num_inputs = 2; size_t size_per_input = 10; - ff_dim_t concat_axis = ff_dim_t{1}; + ff_dim_t concat_axis = ff_dim_t{nonnegative_int{1}}; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 0bb69aa1dc..bbeb349ced 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/flat_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 7d7298f83d..80a046fe37 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/layer_norm_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index de3e5b72b1..d081a0b07c 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ManagedPerDeviceFFHandle") { - ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); SUBCASE("constructor") { @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("move assignment operator") { SUBCASE("move assign to other") { - ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; new_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() 
== nullptr); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index e88c811803..25264b7a58 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/partition_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 00fa968235..eb0702a970 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/pool_2d_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 1c389cb20d..a33748c0de 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/reduction_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 4adf79847a..c06919d603 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/reverse_kernels.h" #include "kernels/reverse_kernels_cpu.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 34993fa151..e94d102b71 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" #include "utils/containers/repeat.h" diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 0bc85cb8e0..f87fb67921 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -7,7 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { std::size_t num_dims = 2; - std::vector perm = {ff_dim_t{0}, ff_dim_t{1}}; + std::vector perm = {ff_dim_t{nonnegative_int{0}}, + ff_dim_t{nonnegative_int{1}}}; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index bfed1241ba..c75abd50ff 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,5 +1,6 @@ #include "test_utils.h" #include "op-attrs/tensor_shape.h" +#include "utils/join_strings.h" #include namespace FlexFlow { @@ -140,21 +141,16 @@ template struct Print2DCPUAccessorR { void operator()(GenericTensorAccessorR const &accessor, std::ostream &stream) { - using T = real_type_t
<DT>;
-
-    T const *data_ptr = accessor.get<DT>();
     int rows = accessor.shape.at(legion_dim_t{0});
     int cols = accessor.shape.at(legion_dim_t{1});
-    for (int i = 0; i < rows; i++) {
-      for (int j = 0; j < cols; j++) {
-        stream << data_ptr[i * cols + j];
+    std::vector<int> indices(cols);
+    std::iota(indices.begin(), indices.end(), 0);
-        if (j < cols - 1) {
-          stream << " ";
-        }
-      }
-      stream << std::endl;
+    for (int i = 0; i < rows; i++) {
+      stream << join_strings(indices, " ", [&](int k) {
+        return accessor.at<DT>
({i, k}); + }) << std::endl; } } }; diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 19599d2900..a41bfc3aff 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -15,7 +15,7 @@ #include #include -using namespace FlexFlow; +using namespace ::FlexFlow; GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator); diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 512c1ef33b..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -13,71 +13,70 @@ // TEST_CASE("Local Cost Estimator") { // // local backing initialization // ManagedPerDeviceFFHandle managed_handle{ -/*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true -} -; +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// /*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = 
throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 2c524c120a..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -13,5 +13,3 @@ name = "SUM" [[values]] name = "AVG" -[[values]] -name = "NONE" diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h deleted file mode 100644 index af4792dd9e..0000000000 --- a/lib/op-attrs/include/op-attrs/make_datatype_value.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H -#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H - -#include "op-attrs/datatype_value.dtg.h" - -namespace FlexFlow { - -DataTypeValue make_float_data_type_value(float value); -DataTypeValue make_double_data_type_value(double value); -DataTypeValue make_int32_data_type_value(int32_t value); -DataTypeValue make_int64_data_type_value(int64_t value); -DataTypeValue make_bool_data_type_value(bool value); - -} // namespace FlexFlow - -#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc similarity index 92% rename from lib/op-attrs/src/op-attrs/make_datatype_value.cc rename to lib/op-attrs/src/op-attrs/datatype_value.cc index 76d712949a..4604ef0b4e 100644 --- a/lib/op-attrs/src/op-attrs/make_datatype_value.cc +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc @@ -1,4 +1,4 @@ -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" namespace FlexFlow { diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..9b0e90b601 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc @@ -0,0 +1,68 @@ +#include "op-attrs/datatype_value.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("test make_data_type_value") { + SUBCASE("make_float_data_type_value") { + float value = 1.0f; + DataTypeValue data_type_value = make_float_data_type_value(value); + + 
CHECK(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<float>() == value);
+    }
+
+    SUBCASE("make_double_data_type_value") {
+      double value = 2.71828;
+      DataTypeValue data_type_value = make_double_data_type_value(value);
+
+      CHECK(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<double>() == value);
+    }
+
+    SUBCASE("make_int32_data_type_value") {
+      int32_t value = -42;
+      DataTypeValue data_type_value = make_int32_data_type_value(value);
+
+      CHECK(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int32_t>() == value);
+    }
+
+    SUBCASE("make_int64_data_type_value") {
+      int64_t value = 1LL << 40;
+      DataTypeValue data_type_value = make_int64_data_type_value(value);
+
+      CHECK(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int64_t>() == value);
+    }
+
+    SUBCASE("make_bool_data_type_value") {
+      bool value = true;
+      DataTypeValue data_type_value = make_bool_data_type_value(value);
+
+      CHECK(data_type_value.has<bool>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK(data_type_value.get<bool>() == value);
+    }
+  }
+}
diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml
new file mode 100644
index 0000000000..ebb2323203
--- /dev/null
+++ b/lib/pcg/include/pcg/metric.enum.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "Metric"
+features = [
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+[[values]]
+name = "ACCURACY"
+
+[[values]]
+name = "CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "SPARSE_CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "ROOT_MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "MEAN_ABSOLUTE_ERROR"
diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h
deleted file mode 100644
index 718919112f..0000000000
--- a/lib/pcg/include/pcg/metric.h
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef _FF_METRICS_H_
-#define _FF_METRICS_H_
-
-#include "op-attrs/ops/loss_functions/loss_functions.h"
-#include "utils/fmt.h"
-#include <vector>
-
-namespace FlexFlow {
-
-enum class Metric {
-  ACCURACY,
-  CATEGORICAL_CROSSENTROPY,
-  SPARSE_CATEGORICAL_CROSSENTROPY,
-  MEAN_SQUARED_ERROR,
-  ROOT_MEAN_SQUARED_ERROR,
-  MEAN_ABSOLUTE_ERROR,
-};
-
-class MetricsAttrs {
-public:
-  MetricsAttrs() = delete;
-  MetricsAttrs(LossFunction, std::vector<Metric> const &);
-
-public:
-  LossFunction loss_type;
-  bool measure_accuracy;
-  bool measure_categorical_crossentropy;
-  bool measure_sparse_categorical_crossentropy;
-  bool measure_mean_squared_error;
-  bool measure_root_mean_squared_error;
-  bool measure_mean_absolute_error;
-};
-
-} // namespace FlexFlow
-
-namespace fmt {
-
-template <>
-struct formatter<::FlexFlow::Metric> : formatter<string_view> {
-  template <typename FormatContext>
-  auto format(::FlexFlow::Metric m, FormatContext &ctx) const
-      -> decltype(ctx.out()) {
-    using namespace FlexFlow;
-
-    string_view name = "unknown";
-    switch (m) {
-      case Metric::ACCURACY:
-        name = "Accuracy";
-        break;
-      case Metric::CATEGORICAL_CROSSENTROPY:
-        name = "CategoricalCrossEntropy";
-        break;
-      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-        name = "SparseCategoricalCrossEntropy";
-        break;
-      case Metric::MEAN_SQUARED_ERROR:
-        name = "MeanSquaredError";
-        break;
-      case Metric::ROOT_MEAN_SQUARED_ERROR:
-        name = "RootMeanSquaredError";
-        break;
-      case Metric::MEAN_ABSOLUTE_ERROR:
-        name = "MeanAbsoluteError";
-        break;
-    }
-    return formatter<string_view>::format(name, ctx);
-  }
-};
-
-} // namespace fmt
-
-#endif
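The replacement header that follows switches the MetricsAttrs constructor from std::vector<Metric> to std::unordered_set<Metric>, so duplicate metric requests collapse instead of redundantly re-setting the same flag. A minimal caller-side sketch of the new signature (the indented block and the specific LossFunction value are illustrative, not taken from the patch):

    #include "pcg/metric_attrs.h"

    using namespace ::FlexFlow;

    MetricsAttrs make_example_metrics_attrs() {
      // Each requested Metric just flips the corresponding measure_* flag in
      // the constructor, so set semantics (unique membership, no ordering)
      // fit better than a vector here.
      std::unordered_set<Metric> metrics = {Metric::ACCURACY,
                                            Metric::MEAN_SQUARED_ERROR};
      return MetricsAttrs{LossFunction::CATEGORICAL_CROSSENTROPY, metrics};
    }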
diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h
new file mode 100644
index 0000000000..343c2154dd
--- /dev/null
+++ b/lib/pcg/include/pcg/metric_attrs.h
@@ -0,0 +1,28 @@
+#ifndef _FF_METRICS_H_
+#define _FF_METRICS_H_
+
+#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "pcg/metric.dtg.h"
+#include "utils/fmt.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+class MetricsAttrs {
+public:
+  MetricsAttrs() = delete;
+  MetricsAttrs(LossFunction, std::unordered_set<Metric> const &);
+
+public:
+  LossFunction loss_type;
+  bool measure_accuracy;
+  bool measure_categorical_crossentropy;
+  bool measure_sparse_categorical_crossentropy;
+  bool measure_mean_squared_error;
+  bool measure_root_mean_squared_error;
+  bool measure_mean_absolute_error;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/pcg/include/pcg/strided_rectangle.h b/lib/pcg/include/pcg/strided_rectangle.h
deleted file mode 100644
index 9c3b8eeda9..0000000000
--- a/lib/pcg/include/pcg/strided_rectangle.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _FLEXFLOW_PCG_INCLUDE_PCG_STRIDED_RECTANGLE_H
-#define _FLEXFLOW_PCG_INCLUDE_PCG_STRIDED_RECTANGLE_H
-
-#include "op-attrs/ff_dim.dtg.h"
-#include "pcg/side_size_t.dtg.h"
-#include "pcg/strided_rectangle.dtg.h"
-
-namespace FlexFlow {
-
-size_t get_num_dims(StridedRectangle const &);
-StridedRectangleSide get_side_at_idx(StridedRectangle const &rect,
-                                     ff_dim_t const &idx);
-num_points_t get_num_points(StridedRectangle const &rect);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc
index 7ff5bec2f7..09772fa9d9 100644
--- a/lib/pcg/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/computation_graph_builder.cc
@@ -1,9 +1,9 @@
 #include "pcg/computation_graph_builder.h"
 #include "op-attrs/computation_graph_op_attrs.h"
+#include "op-attrs/datatype_value.h"
 #include "op-attrs/get_incoming_tensor_roles.h"
 #include "op-attrs/get_op_type.h"
 #include "op-attrs/get_output_shapes.h"
-#include "op-attrs/make_datatype_value.h"
 #include "op-attrs/ops/attention.h"
 #include "op-attrs/ops/batch_norm.h"
 #include "op-attrs/ops/broadcast.h"
diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc
index 69aba90d12..9a93e75350 100644
--- a/lib/pcg/src/pcg/metric.cc
+++ b/lib/pcg/src/pcg/metric.cc
@@ -1,8 +1,8 @@
-#include "pcg/metric.h"
+#include "pcg/metric_attrs.h"
 namespace FlexFlow {
 MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
+                           std::unordered_set<Metric> const &metrics)
     : loss_type(_loss_type), measure_accuracy(false),
       measure_categorical_crossentropy(false),
       measure_sparse_categorical_crossentropy(false),
@@ -29,8 +29,8 @@ MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
       measure_mean_absolute_error = true;
       continue;
     default:
-      throw mk_runtime_error(
-          "Initializing MetricsAttrs with unrecogonized metrics type");
+      throw mk_runtime_error(fmt::format(
+          "Initializing MetricsAttrs with unrecognized metrics type {}", m));
    }
  }
}
diff --git
a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index 79ac43ae66..e2f4555328 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -1,6 +1,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "op-attrs/datatype_value.h" #include "op-attrs/get_incoming_tensor_roles.h" -#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/ops/batch_norm.h" diff --git a/lib/pcg/src/pcg/strided_rectangle_side.cc b/lib/pcg/src/pcg/strided_rectangle_side.cc deleted file mode 100644 index e6caf4cb86..0000000000 --- a/lib/pcg/src/pcg/strided_rectangle_side.cc +++ /dev/null @@ -1,17 +0,0 @@ -#include "pcg/strided_rectangle_side.h" -#include "utils/exception.h" - -namespace FlexFlow { - -StridedRectangleSide strided_side_from_size_and_stride(side_size_t side_size, - int stride) { - assert((side_size.unwrapped % stride) == 0); - return StridedRectangleSide{num_points_t{side_size.unwrapped / stride}, - stride}; -} - -side_size_t get_side_size(StridedRectangleSide const &s) { - return side_size_t{s.num_points.unwrapped * s.stride}; -} - -} // namespace FlexFlow diff --git a/lib/pcg/src/strided_rectangle.cc b/lib/pcg/src/strided_rectangle.cc deleted file mode 100644 index 1c61424ab9..0000000000 --- a/lib/pcg/src/strided_rectangle.cc +++ /dev/null @@ -1,35 +0,0 @@ -#include "pcg/strided_rectangle.h" -#include "op-attrs/dim_ordered/transform.h" -#include "utils/containers.h" - -namespace FlexFlow { - -/* size_t StridedRectangle::at(FFOrdered const &coord) const { */ -/* assert(coord.size() == this->num_dims()); */ - -/* size_t _1d_stride = 1; */ -/* size_t idx = 0; */ -/* for (auto dim : inner_to_outer_idxs(this->sides)) { */ -/* idx += this->sides.at(dim).at(coord.at(dim)).value() * _1d_stride; */ -/* _1d_stride *= this->sides.at(dim).get_size().value(); */ -/* } */ -/* return idx; */ -/* } */ - -size_t get_num_dims(StridedRectangle const &rect) { - return rect.sides.size(); -} - -num_points_t get_num_points(StridedRectangle const &rect) { - return num_points_t{ - product(transform(rect.sides, [](StridedRectangleSide const &side) { - return side.num_points.unwrapped; - }))}; -} - -StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, - ff_dim_t const &idx) { - return rect.sides.at(idx); -} - -} // namespace FlexFlow diff --git a/lib/pcg/test/src/test_machine_view.cc b/lib/pcg/test/src/test_machine_view.cc deleted file mode 100644 index 92a96d5e9a..0000000000 --- a/lib/pcg/test/src/test_machine_view.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include "doctest/doctest.h" -#include "pcg/machine_view.h" -#include "pcg/strided_rectangle.h" -#include "pcg/strided_rectangle_side.h" - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("MachineView general util functions") { - StridedRectangle rect{{StridedRectangleSide{num_points_t{7}, 5}, - StridedRectangleSide{num_points_t{10}, 2}}}; - gpu_id_t start(1); - MachineView mv{device_id_t{start}, rect}; - SUBCASE("num_dims") { - CHECK(num_dims(mv) == 2); - } - SUBCASE("num_devices") { - CHECK(num_devices(mv) == 7 * 10); - } - SUBCASE("get_device_type") { - CHECK(get_device_type(mv) == DeviceType::GPU); - } - } - - TEST_CASE("MachineView make_1d_machine_view - GPU") { - StridedRectangle 
rect{{StridedRectangleSide{num_points_t{7}, 5}}}; - device_id_t start_gpu{gpu_id_t{1}}; - MachineView gpu_mv{start_gpu, rect}; - - SUBCASE("make_1d_machine_view(gpu_id_t start, gpu_id_t stop, int stride)") { - MachineView result = - make_1d_machine_view(start_gpu, device_id_t{gpu_id_t(1 + 7 * 5)}, 5); - MachineView correct = gpu_mv; - CHECK(result == correct); - } - SUBCASE("make_1d_machine_view(gpu_id_t start, num_points_t num_points, int " - "stride)") { - MachineView result = make_1d_machine_view(start_gpu, num_points_t{7}, 5); - MachineView correct = gpu_mv; - CHECK(result == correct); - } - SUBCASE("make_1d_machine_view(gpu_id_t start, side_size_t interval_size, " - "int stride)") { - MachineView result = make_1d_machine_view( - start_gpu, get_side_size(rect.sides.at(ff_dim_t{0})), 5); - MachineView correct = gpu_mv; - CHECK(result == correct); - } - } - - TEST_CASE("MachineView make_1d_machine_view - CPU") { - StridedRectangle rect{{StridedRectangleSide{num_points_t{11}, 4}}}; - device_id_t start_cpu{cpu_id_t{2}}; - MachineView cpu_mv{start_cpu, rect}; - - SUBCASE("make_1d_machine_view(cpu_id_t start, cpu_id_t stop, int stride)") { - MachineView result = - make_1d_machine_view(start_cpu, device_id_t{cpu_id_t(2 + 11 * 4)}, 4); - MachineView correct = cpu_mv; - CHECK(result == correct); - } - SUBCASE("make_1d_machine_view(cpu_id_t start, num_points_t num_points, int " - "stride)") { - MachineView result = make_1d_machine_view(start_cpu, num_points_t{11}, 4); - MachineView correct = cpu_mv; - CHECK(result == correct); - } - SUBCASE("make_1d_machine_view(cpu_id_t start, side_size_t interval_size, " - "int stride)") { - MachineView result = make_1d_machine_view( - start_cpu, get_side_size(rect.sides.at(ff_dim_t{0})), 4); - MachineView correct = cpu_mv; - CHECK(result == correct); - } - } -} diff --git a/lib/pcg/test/src/test_strided_rectangle.cc b/lib/pcg/test/src/test_strided_rectangle.cc deleted file mode 100644 index ef342944de..0000000000 --- a/lib/pcg/test/src/test_strided_rectangle.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "doctest/doctest.h" -#include "pcg/strided_rectangle.h" -#include "pcg/strided_rectangle_side.h" - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("get_side_size(StridedRectangleSide)") { - StridedRectangleSide side{num_points_t{7}, 5}; - - CHECK(get_side_size(side) == side_size_t{7 * 5}); - } - TEST_CASE("strided_side_from_size_and_stride") { - StridedRectangleSide correct{num_points_t{10}, 3}; - StridedRectangleSide result = - strided_side_from_size_and_stride(side_size_t{10 * 3}, 3); - CHECK(result == correct); - } - - TEST_CASE("StridedRectangle - helper functions") { - - StridedRectangleSide s0{num_points_t{7}, 5}; - StridedRectangleSide s1{num_points_t{10}, 2}; - StridedRectangleSide s2{num_points_t{8}, 1}; - StridedRectangle rect{{s0, s1, s2}}; - - SUBCASE("get_num_dims") { - CHECK(get_num_dims(rect) == 3); - } - SUBCASE("get_num_points") { - CHECK(get_num_points(rect) == num_points_t{7 * 8 * 10}); - } - SUBCASE("get_side_at_idx") { - CHECK(get_side_at_idx(rect, ff_dim_t{0}) == s0); - CHECK(get_side_at_idx(rect, ff_dim_t{1}) == s1); - CHECK(get_side_at_idx(rect, ff_dim_t{2}) == s2); - } - } -} From 8860adfc61a17a5bcb23075f90c0661d74589d07 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 28 Jan 2025 18:45:06 -0800 Subject: [PATCH 22/42] build issues --- .envrc | 3 --- .vimrc | 8 -------- lib/kernels/test/src/test_utils.cc | 3 +-- 3 files changed, 1 insertion(+), 13 deletions(-) delete mode 100644 .envrc delete mode 100644 .vimrc diff --git a/.envrc b/.envrc 
deleted file mode 100644 index 2797f0f929..0000000000 --- a/.envrc +++ /dev/null @@ -1,3 +0,0 @@ -source_up_if_exists - -use flake diff --git a/.vimrc b/.vimrc deleted file mode 100644 index 4c8a8a8279..0000000000 --- a/.vimrc +++ /dev/null @@ -1,8 +0,0 @@ -" example search path configuration -set path=lib/runtime/**,lib/** - -" set build target -" let g:target = "pcg" - -" set test target -" let g:test_target = "utils-test" diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index c75abd50ff..a15447446a 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -3,7 +3,7 @@ #include "utils/join_strings.h" #include -namespace FlexFlow { +using namespace ::FlexFlow; GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, Allocator &allocator) { @@ -242,4 +242,3 @@ GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, create_filled_accessor_w(shape, allocator, val); return read_only_accessor_from_write_accessor(w_accessor); } -} // namespace FlexFlow From 7b74acc66b00b9e3380cab3598345660ceb8d5a1 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 29 Jan 2025 19:39:32 -0800 Subject: [PATCH 23/42] Add AWS linux AMI to runs-on for testing (#1589) --- .github/runs-on.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 14f75549dd..b558b5131a 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,4 +1,10 @@ images: + amazon-linux-gpu-x64: + platform: "linux" + arch: "x64" + owner: "898082745236" # AWS + name: "Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver*" + dlami-x64: platform: "linux" arch: "x64" @@ -8,4 +14,4 @@ images: runners: gpu-nvidia: family: ["g4dn.xlarge"] - image: dlami-x64 + image: amazon-linux-gpu-x64 From 8cdc677f2fbaa85d55577c846ed0e644ab47e272 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Thu, 30 Jan 2025 13:57:39 -0800 Subject: [PATCH 24/42] Pin runs-on images (#1590) --- .github/runs-on.yml | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/runs-on.yml b/.github/runs-on.yml index b558b5131a..6312b64955 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,17 +1,12 @@ images: - amazon-linux-gpu-x64: + runs-on-gpu-pinned: platform: "linux" arch: "x64" - owner: "898082745236" # AWS - name: "Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver*" + owner: "135269210855" # runs-on + name: "runs-on-v2.2-ubuntu22-gpu-x64-20250123194414" - dlami-x64: + runs-on-cpu-pinned: platform: "linux" arch: "x64" - owner: "898082745236" # AWS - name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" - -runners: - gpu-nvidia: - family: ["g4dn.xlarge"] - image: amazon-linux-gpu-x64 + owner: "135269210855" # runs-on + name: "runs-on-v2.2-ubuntu22-full-x64-20250101080516" From 209db7ee4434ceb1a2bc700a583bd35d2039aa30 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 31 Jan 2025 00:20:51 -0800 Subject: [PATCH 25/42] GPU CI Fix (Pin runs-on GPU image) (#1588) * Debug * Change to base DL AMI * Print disk usage * Run nvidia-smi * Remove excess cuda installs in base ami * Re-enable freeing space in GPU CI * Try updating nix-develop version * Check what happens if you just enter the non-nixGL environment * Try switching AMIs * Try to remove the module stuff * Move to lockshaw/develop-action * Try pointing at a fixed commit * Update nix-develop action * Update nix-develop action to use BASH_FUNC filtering * Remove all the /usr/local/cuda entries * 
Switch back to gpu-ci env * Update the cuda arch * Try out the new runs-on gpu image * Move over to pinned runs-on image * Remove a bunch more unnecessary stuff in image to get back disk space * Try using an emphemeral store * Try mounting * Fix bug * Try sudo * Move nix into _work * Rollback all unnecessary changes * Re-enable waiting on cpu-ci --- .github/workflows/helpers/free_space_on_runner_gpu.sh | 8 -------- .github/workflows/tests.yml | 9 +++++---- 2 files changed, 5 insertions(+), 12 deletions(-) delete mode 100755 .github/workflows/helpers/free_space_on_runner_gpu.sh diff --git a/.github/workflows/helpers/free_space_on_runner_gpu.sh b/.github/workflows/helpers/free_space_on_runner_gpu.sh deleted file mode 100755 index a382ee58f6..0000000000 --- a/.github/workflows/helpers/free_space_on_runner_gpu.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -euo pipefail -set -x - -sudo rm -rf /usr/share/dotnet -sudo rm -rf /usr/local/lib/android -sudo rm -rf /opt/ghc -sudo rm -rf "/usr/local/share/boost" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7e2dabd784..e2fc0b6df6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: submodules: recursive - name: Free additional space on runner - run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh + run: ./.github/workflows/helpers/free_space_on_runner.sh - name: Install nix uses: cachix/install-nix-action@v25 @@ -67,7 +67,7 @@ jobs: runs-on: - runs-on - family=g4dn.xlarge - - image=ubuntu22-full-x64 + - image=runs-on-gpu-pinned strategy: max-parallel: 1 @@ -79,8 +79,9 @@ jobs: with: submodules: recursive - - name: free additional space on runner - run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh + - name: mount ephemeral drive to nix + run: | + sudo mkdir $HOME/_work/nix && sudo mkdir /nix && sudo mount --bind $HOME/_work/nix /nix - name: install nix uses: cachix/install-nix-action@v25 From 0d2ffdb278a6e0a204e94083d4dfaec5db249200 Mon Sep 17 00:00:00 2001 From: Victor Li <32348970+victorli2002@users.noreply.github.com> Date: Sat, 1 Feb 2025 12:54:42 -0800 Subject: [PATCH 26/42] Merge substitution-builder (#1575) * Start on pcg builder * Add tests and some implementation for pcg builder * Add pcg tests, make dtgen constructors explicit to fix bug * Add remainder of PCG tests * Fix build issues in local-execution * Format * Address Reyna comments, add topological_order function for PCG * Pre multidigraph refactor * Removing visitable from sp code * Add open dataflow graph, start to replace pcg dataflow graph * Start refactoring substitutions * Add utility functions to support pattern matching * Pre-refactor inputs * Fix proj url * Get back to substitutions, now with unordered graph inputs * Get substitutions building * substitutions-tests now builds * Fix bug in filter, pass some initial substitution tests * Add tests for fmt::to_string, fix some substitutions bugs * Pass initial unit tests for find_pattern_matches * Start on unit tests for pcg pattern * Pass initial test for find_pattern_matches * Fix small build issue in tests * Format * Sync tests in CI with tests in proj * Fix minor build errors in kernels and local-execution * Format * Remove outdated code * More outdated code removal * More cleanup, add test for sp decomposition * Pull apart containers.h * More sp testing and fixes * Break up graph algorithms.h * Pre- full SP algo commit * Add initial implementation and tests for cbc decomposition and inverse line graph * Pass test for 
get_inverse_line_graph * Add new multidigraph * Fix get_inverse_line_graph to return a MultiDiGraph instead of a DiGraph * Add tests for parallel and series reduction finding * Add really rough implementation of valdez sp decomposition * Fix local-execution build * Add implementations and tests for applying series/parallel reductions * Format * Clean up sp decomposition interface and tests * Format * Add comments for top-level substitutions functions, add proj doxygen support * Start sketching out substitutions code * Fix build errors * Add ability to permute node ids * Cleanup and start to test new substitutions code * Add test case for evaluate_substitution_output * Add naive isomorphism detection code * Add graph inputs to open dataflow graph isomorphism * Add input permutation to evaluate_substitution_output * Fix permute_node_ids * Add test for permute_input_ids * Migrate over to mutable implementation of apply_substitution * Add fast isomorphism checking and an initial implementation of full substitution logic * Pass initial full substitutions test * Cleanup old isomorphism checking code * Fix post-merge bugs * Fix broken pcg builder test * Format * Reorganize code and remove some outdated code pre-code-review * Format * Restarting work on this after working on export-model-arch * Adding in some a simple function to get the currently available substritutions * nonnegative_int additions, code cleanup, etc. * A bunch more moving over to nonnegative_int * Even more nonnegative_int updating * Fix build * Fix failing tests * Format * Format --------- Co-authored-by: Colin Unger Co-authored-by: Victor Li --- .../src/export_model_arch.cc | 17 +- cmake/flexflow-utils.cmake | 14 +- flake.nix | 14 +- ...omputation_graph_binary_sp_decomposition.h | 5 +- .../src/compiler/allowed_machine_views.cc | 41 +- .../get_machine_resource_splits.cc | 10 +- .../machine_mapping/machine_mapping.cc | 10 +- ...el_layer_guid_oblivious_machine_mapping.cc | 4 +- ...mputation_graph_binary_sp_decomposition.cc | 2 +- .../test/src/allowed_machine_views.cc | 60 +- ...racted_tensor_set_movement_across_split.cc | 8 +- .../get_machine_resource_splits.cc | 193 ++--- .../get_optimal_machine_mapping.cc | 28 +- .../get_tensor_set_movement_across_split.cc | 32 +- .../machine_mapping/machine_mapping.cc | 24 +- .../get_machine_mapping_problem_tree.cc | 6 +- .../machine_mapping/machine_mapping_result.cc | 36 +- ...get_optimal_machine_mapping_with_memory.cc | 28 +- .../machine_mapping_result_with_memory.cc | 80 +-- ...ion_graph_series_parallel_decomposition.cc | 64 +- .../task_graph_simulator/task_simulator.cc | 88 ++- lib/compiler/test/src/graph_optimize_state.cc | 59 +- lib/kernels/include/kernels/array_shape.h | 36 +- .../include/kernels/batch_norm_kernels.h | 43 +- .../batch_norm_per_device_state.struct.toml | 68 ++ lib/kernels/include/kernels/legion_dim.h | 10 +- .../include/kernels/legion_dim_t.struct.toml | 7 +- .../kernels/per_device_op_state.variant.toml | 5 - .../include/kernels/transpose_kernels.h | 17 +- lib/kernels/src/allocation.cc | 3 +- lib/kernels/src/array_shape.cc | 53 +- lib/kernels/src/cuda/cuda_helper.cu | 8 +- .../src/cuda/ops/batch_norm_kernels.cu | 32 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 4 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 5 +- lib/kernels/src/cuda/ops/concat_kernels.cu | 7 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 16 +- .../src/cuda/ops/element_unary_kernels.cu | 8 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 12 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 54 +- 
lib/kernels/src/cuda/ops/partition_kernels.cu | 16 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 8 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 7 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 99 +-- lib/kernels/src/legion_dim.cc | 9 +- lib/kernels/test/src/test_attention_kernel.cc | 50 +- .../test/src/test_batch_matmul_kernel.cc | 30 +- .../test/src/test_batch_norm_kernel.cc | 58 +- lib/kernels/test/src/test_cast_kernel.cc | 4 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 8 +- lib/kernels/test/src/test_dropout.cc | 4 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 7 +- .../test/src/test_layer_norm_kernels.cc | 8 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 52 +- lib/kernels/test/src/test_reduction_kernel.cc | 5 +- lib/kernels/test/src/test_replicate_kernel.cc | 12 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 51 +- lib/kernels/test/src/test_softmax_kernel.cc | 19 +- lib/kernels/test/src/test_split_kernel.cc | 12 +- lib/kernels/test/src/test_transpose_kernel.cc | 19 +- ...device_specific_device_states.variant.toml | 6 +- .../local-execution/legion_tensor_shape.h | 40 -- .../local-execution/task_id_t.enum.toml | 3 - .../src/legion_tensor_shape.cc | 15 - lib/local-execution/src/ops/attention.cc | 55 +- lib/local-execution/src/ops/batch_matmul.cc | 73 +- lib/local-execution/src/ops/batch_matmul.h | 2 +- lib/local-execution/src/ops/batch_norm.cc | 27 +- lib/local-execution/src/ops/conv_2d.cc | 26 +- lib/local-execution/src/ops/gather.cc | 9 +- lib/local-execution/src/ops/layer_norm.cc | 24 +- lib/local-execution/src/ops/linear.cc | 49 +- lib/local-execution/src/ops/pool_2d.cc | 91 ++- lib/local-execution/src/ops/reduce.cc | 9 +- lib/local-execution/src/ops/reduction.cc | 4 +- lib/local-execution/src/ops/replicate.cc | 4 +- lib/local-execution/src/ops/reverse.cc | 46 +- lib/local-execution/src/ops/softmax.cc | 20 +- lib/local-execution/src/ops/split.cc | 51 +- lib/local-execution/src/ops/topk.cc | 20 +- lib/local-execution/src/ops/transpose.cc | 55 +- lib/local-execution/src/ops/transpose.h | 3 - .../src/task_signature_impl.cc | 4 - .../test/src/test_local_slots_backing.cc | 13 +- .../test/src/test_local_task_arg_accessor.cc | 13 +- .../test/src/test_task_registry.cc | 8 +- .../models/bert/bert_config.struct.toml | 15 +- .../candle_uno/candle_uno_config.struct.toml | 9 +- .../inception_v3_config.struct.toml | 8 +- .../include/models/split_test/split_test.h | 2 +- .../transformer_config.struct.toml | 21 +- lib/models/src/models/bert/bert.cc | 41 +- .../src/models/candle_uno/candle_uno.cc | 57 +- .../src/models/inception_v3/inception_v3.cc | 675 +++++++++--------- .../src/models/split_test/split_test.cc | 16 +- .../src/models/transformer/transformer.cc | 89 +-- .../computation_graph_op_attrs.variant.toml | 2 +- lib/op-attrs/include/op-attrs/datatype.h | 3 +- .../op-attrs/dim_ordered/dim_ordered.h | 27 +- .../include/op-attrs/dim_ordered/slice.h | 4 +- lib/op-attrs/include/op-attrs/get_op_type.h | 2 +- lib/op-attrs/include/op-attrs/ops/attention.h | 36 +- .../multihead_attention_inputs.struct.toml | 12 +- .../op-attrs/ops/attention_attrs.struct.toml | 12 +- .../include/op-attrs/ops/batch_matmul.h | 2 +- .../op-attrs/ops/batch_matmul.struct.toml | 19 - .../ops/batch_matmul_attrs.struct.toml | 30 + 
.../op-attrs/ops/combine_attrs.struct.toml | 3 +- .../conv_2d/conv_2d_input_shape.struct.toml | 9 +- .../conv_2d_parallel_input_shape.struct.toml | 5 +- .../op-attrs/ops/conv_2d_attrs.struct.toml | 17 +- .../op-attrs/ops/embedding_attrs.struct.toml | 7 +- .../op-attrs/ops/linear_attrs.struct.toml | 3 +- lib/op-attrs/include/op-attrs/ops/pool_2d.h | 4 +- .../op-attrs/ops/pool_2d_attrs.struct.toml | 13 +- .../op-attrs/ops/reduction_attrs.struct.toml | 6 +- .../ops/repartition_attrs.struct.toml | 3 +- .../op-attrs/ops/replicate_attrs.struct.toml | 6 +- .../op-attrs/ops/split_attrs.struct.toml | 3 +- .../op-attrs/ops/topk_attrs.struct.toml | 6 +- .../parallel_tensor_dim_degrees.struct.toml | 3 +- .../include/op-attrs/parallel_tensor_dims.h | 20 +- .../include/op-attrs/parallel_tensor_shape.h | 25 +- .../discard_copy_degree.struct.toml | 6 +- .../sum_degree.struct.toml | 6 +- .../op-attrs/pcg_operator_attrs.variant.toml | 2 +- .../include/op-attrs/relative_ff_dim_t.h | 2 +- .../op-attrs/replica_parallel_dim.struct.toml | 3 +- .../op-attrs/replica_parallel_dim_set.h | 3 +- .../op-attrs/shard_parallel_dim.struct.toml | 8 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 8 +- .../include/op-attrs/tensor_dims.struct.toml | 4 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 10 +- lib/op-attrs/src/op-attrs/datatype.cc | 15 +- lib/op-attrs/src/op-attrs/ff_dim_t.cc | 2 +- lib/op-attrs/src/op-attrs/ops/attention.cc | 474 ++---------- .../attention/multihead_attention_inputs.cc | 18 +- .../multihead_attention_parallel_inputs.cc | 6 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 21 +- lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 27 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 15 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 65 +- .../ops/conv_2d/conv_2d_input_shape.cc | 8 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 24 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 18 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 20 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 73 +- .../src/op-attrs/parallel_tensor_dims.cc | 39 +- .../src/op-attrs/parallel_tensor_shape.cc | 64 +- .../src/op-attrs/relative_ff_dim_t.cc | 4 +- .../src/op-attrs/replica_parallel_dim_set.cc | 6 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 15 +- lib/op-attrs/src/op-attrs/tensor_shape.cc | 13 +- .../test/src/op-attrs/ops/attention.cc | 178 +++-- .../test/src/op-attrs/ops/batch_matmul.cc | 144 ++-- .../test/src/op-attrs/ops/batch_norm.cc | 84 +-- lib/op-attrs/test/src/op-attrs/ops/cast.cc | 34 +- lib/op-attrs/test/src/op-attrs/ops/combine.cc | 20 +- lib/op-attrs/test/src/op-attrs/ops/concat.cc | 176 ++--- lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc | 168 ++--- lib/op-attrs/test/src/op-attrs/ops/dropout.cc | 62 +- .../test/src/op-attrs/ops/element_binary.cc | 70 +- .../test/src/op-attrs/ops/element_unary.cc | 38 +- .../test/src/op-attrs/ops/embedding.cc | 68 +- lib/op-attrs/test/src/op-attrs/ops/flat.cc | 110 +-- .../test/src/op-attrs/ops/layer_norm.cc | 93 +-- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 142 ++-- lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc | 226 +++--- .../test/src/op-attrs/ops/reduction.cc | 16 +- .../test/src/op-attrs/ops/repartition.cc | 16 +- .../test/src/op-attrs/ops/replicate.cc | 17 +- lib/op-attrs/test/src/op-attrs/ops/softmax.cc | 78 +- .../test/src/op-attrs/pcg_operator_attrs.cc | 4 +- .../test/src/op-attrs/relative_ff_dim_t.cc | 10 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 31 +- .../include/pcg/computation_graph_builder.h | 91 +-- 
lib/pcg/include/pcg/cpu_id_t.struct.toml | 6 +- lib/pcg/include/pcg/device_id.h | 4 +- .../file_format/v1/graphs/v1_dataflow_graph.h | 2 +- .../v1/graphs/v1_dataflow_graph.struct.toml | 3 +- .../v1/graphs/v1_graph_edge.struct.toml | 12 +- .../v1/graphs/v1_labelled_dataflow_graph.h | 9 +- .../v1_labelled_dataflow_graph.struct.toml | 5 +- .../v1_binary_sp_decomposition.variant.toml | 3 +- .../pcg/file_format/v1/v1_computation_graph.h | 2 +- lib/pcg/include/pcg/gpu_id_t.struct.toml | 6 +- .../pcg/machine_space_coordinate.struct.toml | 5 +- lib/pcg/include/pcg/machine_specification.h | 12 +- .../pcg/machine_specification.struct.toml | 10 +- lib/pcg/include/pcg/machine_view.h | 2 +- lib/pcg/include/pcg/operator_task_space.h | 4 +- .../pcg/operator_task_space.struct.toml | 3 +- .../parallel_computation_graph_builder.h | 38 +- .../parallel_computation_graph_edge.h | 2 +- .../pcg/start_invariant_machine_view.h | 2 +- lib/pcg/include/pcg/stride_t.struct.toml | 6 +- .../pcg/task_space_coordinate.struct.toml | 3 +- lib/pcg/src/pcg/computation_graph_builder.cc | 137 ++-- lib/pcg/src/pcg/device_id.cc | 4 +- .../v1/graphs/v1_dataflow_graph.cc | 6 +- .../v1/graphs/v1_labelled_dataflow_graph.cc | 16 + .../v1/v1_binary_sp_decomposition/json.cc | 4 +- .../file_format/v1/v1_computation_graph.cc | 7 +- lib/pcg/src/pcg/machine_space_offset.cc | 6 +- lib/pcg/src/pcg/machine_specification.cc | 17 +- lib/pcg/src/pcg/machine_view.cc | 79 +- lib/pcg/src/pcg/operator_task_space.cc | 22 +- .../generate_weight_transform.cc | 4 +- .../parallel_computation_graph_builder.cc | 98 +-- .../parallel_computation_graph_edge.cc | 2 +- .../src/pcg/start_invariant_machine_view.cc | 7 +- lib/pcg/test/src/pcg/computation_graph.cc | 40 +- .../test/src/pcg/computation_graph_builder.cc | 18 +- .../v1/v1_binary_sp_decomposition/json.cc | 18 +- .../file_format/v1/v1_computation_graph.cc | 8 +- .../v1/v1_parallel_computation_graph.cc | 10 +- lib/pcg/test/src/pcg/machine_specification.cc | 17 +- lib/pcg/test/src/pcg/machine_view.cc | 160 ++--- lib/pcg/test/src/pcg/operator_task_space.cc | 28 +- .../parallel_computation_graph.cc | 38 +- .../parallel_computation_graph_builder.cc | 155 ++-- .../src/pcg/start_invariant_machine_view.cc | 56 +- .../apply_substitution/apply_substitution.h | 31 + .../evaluate_substitution_output.h | 6 +- .../output_expr_to_result_sub_pcg_mapping.h | 6 +- ...expr_to_result_sub_pcg_mapping.struct.toml | 0 .../perform_shape_inference.h | 4 +- .../substitutions/constraint_type.enum.toml | 3 + .../operator_pattern/get_attribute_map.h | 15 + .../operator_attribute_constraint.h | 2 + .../operator_attribute_key.enum.toml | 1 + .../operator_pattern/operator_attribute_key.h | 12 + ...operator_attribute_list_access.struct.toml | 5 +- .../operator_attribute_value.variant.toml | 14 +- .../output_graph/output_graph_expr.h | 5 + .../output_graph/output_graph_expr_value.h | 16 + .../output_graph_expr_value.variant.toml | 19 + .../output_operator_attrs_assignment.h | 3 + ...tput_operator_attrs_assignment.struct.toml | 7 +- .../include/substitutions/pcg_pattern.h | 2 + .../include/substitutions/pcg_pattern_match.h | 4 +- .../sub_parallel_computation_graph_edge.h | 2 +- .../include/substitutions/substitution.h | 25 +- .../substitutions/substitution_builder.h | 49 ++ .../tensor_attribute_list_access.struct.toml | 5 +- .../tensor_pattern/tensor_attribute_pattern.h | 3 + .../tensor_attribute_value.variant.toml | 5 +- .../substitutions/unity_substitution_set.h | 47 ++ .../unlabelled/input_pattern_edge.h | 2 +- 
.../unlabelled/pattern_matching.h | 10 +- .../unlabelled/pattern_node_output.h | 2 +- .../unlabelled/standard_pattern_edge.h | 4 +- .../apply_substitution/apply_substitution.cc | 165 +++++ .../evaluate_substitution_output.cc | 4 +- .../output_expr_to_result_sub_pcg_mapping.cc | 2 +- .../perform_shape_inference.cc | 2 +- .../operator_pattern/eval_list_access.cc | 21 +- .../operator_pattern/eval_list_size.cc | 5 +- .../operator_pattern/get_attribute.cc | 156 ++-- .../operator_pattern/get_attribute_map.cc | 25 + .../operator_attribute_constraint.cc | 10 + .../operator_attribute_key.cc | 68 ++ .../materialize_operator_from_attrs_map.cc | 27 +- .../output_graph/output_graph_expr.cc | 18 + .../output_graph/output_graph_expr_value.cc | 30 + .../output_operator_attrs_assignment.cc | 41 +- .../src/substitutions/pcg_pattern.cc | 18 + .../sub_parallel_computation_graph.cc | 55 +- .../sub_parallel_computation_graph_edge.cc | 2 +- .../src/substitutions/substitution.cc | 301 ++++---- .../src/substitutions/substitution_builder.cc | 162 +++++ .../tensor_pattern/eval_list_access.cc | 5 +- .../tensor_pattern/eval_list_size.cc | 5 +- .../tensor_pattern/get_attribute.cc | 10 +- .../tensor_attribute_pattern.cc | 16 + .../substitutions/unity_substitution_set.cc | 235 ++++++ .../unlabelled/input_pattern_edge.cc | 2 +- .../unlabelled/pattern_node_output.cc | 2 +- .../unlabelled/standard_pattern_edge.cc | 4 +- .../apply_substitution/apply_substitution.cc | 174 +++++ .../evaluate_substitution_output.cc | 63 +- .../perform_shape_inference.cc | 63 +- .../operator_pattern/get_attribute.cc | 2 +- .../test/src/substitutions/pcg_pattern.cc | 14 +- .../test/src/substitutions/substitution.cc | 345 ++++----- .../src/substitutions/substitution_builder.cc | 145 ++++ .../substitutions/unity_substitution_set.cc | 20 + .../unlabelled/find_pattern_matches.cc} | 29 +- .../unlabelled/pattern_matching.cc | 210 ++++++ .../substitutions/unlabelled/pattern_split.cc | 8 +- .../unlabelled/unlabelled_graph_pattern.cc | 4 +- .../test/src/test_substitution.cc | 148 ---- .../algorithms/bidict_from_enumerating.h | 14 +- .../utils/cli/cli_flag_key.struct.toml | 6 +- .../cli_positional_argument_key.struct.toml | 6 +- lib/utils/include/utils/containers/at_idx.h | 5 +- .../include/utils/containers/enumerate.h | 16 +- .../utils/containers/enumerate_vector.h | 11 +- lib/utils/include/utils/containers/flatmap.h | 15 +- .../get_all_permutations_with_repetition.h | 10 +- lib/utils/include/utils/containers/make.h | 13 + .../include/utils/containers/merge_maps.h | 60 +- .../utils/containers/merge_method.enum.toml | 17 + lib/utils/include/utils/containers/product.h | 2 +- lib/utils/include/utils/containers/repeat.h | 5 +- .../include/utils/containers/repeat_element.h | 22 + .../include/utils/containers/replicate.h | 15 - lib/utils/include/utils/containers/sum.h | 2 +- .../algorithms/view_as_open_dataflow_graph.h | 34 + .../dataflow_edge_query.struct.toml | 5 +- .../graph/dataflow_graph/dataflow_graph.h | 3 +- .../dataflow_graph/dataflow_input.struct.toml | 3 +- .../dataflow_output.struct.toml | 3 +- .../dataflow_output_query.struct.toml | 6 +- .../graph/dataflow_graph/i_dataflow_graph.h | 2 +- .../instances/unordered_set_dataflow_graph.h | 4 +- ...ordered_set_labelled_open_dataflow_graph.h | 7 +- .../algorithms/get_graph_data.h | 1 + .../algorithms/permute_node_ids.h | 1 + .../graph/multidigraph/algorithms/add_nodes.h | 3 +- .../algorithms/are_isomorphic.h | 13 + .../open_dataflow_graph_isomorphism.h | 21 + .../dataflow_input_edge_query.struct.toml | 3 
+- .../i_open_dataflow_graph.h | 2 +- .../open_dataflow_graph/open_dataflow_edge.h | 2 +- .../open_dataflow_graph/open_dataflow_graph.h | 2 +- .../unordered_set_open_dataflow_graph.h | 2 +- lib/utils/include/utils/graph/render_dot.h | 19 + .../include/utils/nonnegative_int/ceildiv.h | 11 + .../utils/nonnegative_int/nonnegative_int.h | 28 +- .../utils/nonnegative_int/nonnegative_range.h | 14 + .../utils/nonnegative_int/num_elements.h | 17 + lib/utils/include/utils/variant.h | 1 + .../algorithms/bidict_from_enumerating.cc | 13 + lib/utils/src/utils/cli/cli_parse.cc | 6 +- lib/utils/src/utils/cli/cli_spec.cc | 15 +- lib/utils/src/utils/containers/at_idx.cc | 9 + lib/utils/src/utils/containers/enumerate.cc | 11 + .../src/utils/containers/enumerate_vector.cc | 9 + lib/utils/src/utils/containers/make.cc | 8 + lib/utils/src/utils/containers/range.cc | 1 + lib/utils/src/utils/containers/repeat.cc | 10 + .../src/utils/containers/repeat_element.cc | 10 + lib/utils/src/utils/containers/replicate.cc | 1 - .../utils/graph/dataflow_graph/algorithms.cc | 2 +- .../graph/dataflow_graph/algorithms/as_dot.cc | 41 +- .../get_dataflow_edges_from_node_to_node.cc | 4 +- .../algorithms/get_incoming_edges.cc | 8 +- .../algorithms/get_outgoing_edges.cc | 8 +- .../algorithms/get_subgraph_incoming_edges.cc | 4 +- .../algorithms/get_subgraph_outgoing_edges.cc | 4 +- .../algorithms/view_as_open_dataflow_graph.cc | 22 +- .../algorithms/view_as_open_dataflow_graph.h | 32 - .../dataflow_graph/dataflow_edge_query.cc | 20 +- .../graph/dataflow_graph/dataflow_graph.cc | 2 +- .../dataflow_graph/dataflow_output_query.cc | 6 +- .../dataflow_graph/i_dataflow_graph_view.cc | 4 +- .../digraph/algorithms/transitive_closure.cc | 5 +- .../algorithms/transitive_reduction.cc | 4 +- .../instances/unordered_set_dataflow_graph.cc | 10 +- .../multidigraph/algorithms/add_nodes.cc | 2 +- .../algorithms/are_isomorphic.cc | 11 + .../open_dataflow_graph/algorithms/as_dot.cc | 11 +- .../algorithms/get_incoming_edges.cc | 6 +- .../algorithms/get_subgraph_incoming_edges.cc | 6 +- .../open_dataflow_graph_isomorphism.cc | 54 ++ .../dataflow_input_edge_query.cc | 10 +- .../open_dataflow_graph/open_dataflow_edge.cc | 2 +- .../open_dataflow_graph.cc | 2 +- .../unordered_set_open_dataflow_graph.cc | 2 +- lib/utils/src/utils/graph/render_dot.cc | 90 +++ .../src/utils/nonnegative_int/ceildiv.cc | 20 + .../utils/nonnegative_int/nonnegative_int.cc | 79 +- .../nonnegative_int/nonnegative_range.cc | 19 + .../src/utils/nonnegative_int/num_elements.cc | 10 + lib/utils/test/src/main.cc | 2 - .../algorithms/bidict_from_enumerating.cc | 19 +- lib/utils/test/src/utils/cli/cli_parse.cc | 34 +- lib/utils/test/src/utils/containers/at_idx.cc | 29 + .../test/src/utils/containers/enumerate.cc | 33 +- .../src/utils/containers/enumerate_vector.cc | 33 + .../test/src/utils/containers/flatmap.cc | 32 + .../get_all_permutations_with_repetition.cc | 8 +- lib/utils/test/src/utils/containers/make.cc | 15 + .../test/src/utils/containers/merge_maps.cc | 78 +- .../test/src/utils/containers/product.cc | 20 + lib/utils/test/src/utils/containers/repeat.cc | 2 +- .../{replicate.cc => repeat_element.cc} | 9 +- .../utils/graph/dataflow_graph/algorithms.cc | 14 +- .../dataflow_graphs_are_isomorphic.cc | 24 +- .../algorithms/find_isomorphism.cc | 24 +- .../get_dataflow_edges_from_node_to_node.cc | 26 +- .../algorithms/get_incoming_edges.cc | 14 +- .../algorithms/get_outgoing_edges.cc | 28 +- .../algorithms/get_subgraph_incoming_edges.cc | 14 +- .../algorithms/get_subgraph_outgoing_edges.cc 
| 12 +- ...sitive_reduced_boundary_nodes_for_split.cc | 8 +- ...t_transitive_reduced_edges_across_split.cc | 34 +- ...transitive_reduced_outputs_across_split.cc | 8 +- .../unordered_open_dataflow_graph.cc | 8 +- .../multidigraph/algorithms/add_edges.cc | 2 +- .../multidigraph/algorithms/add_nodes.cc | 2 +- .../multidigraph/algorithms/get_edges.cc | 2 +- .../algorithms/find_isomorphism.cc | 23 +- .../get_open_dataflow_graph_inputs.cc | 2 +- .../get_open_dataflow_value_uses.cc | 20 +- .../get_unused_open_dataflow_graph_inputs.cc | 4 +- .../open_dataflow_graphs_are_isomorphic.cc | 23 +- .../algorithms/permute_input_ids.cc | 14 +- .../algorithms/permute_node_ids.cc | 28 +- .../series_parallel/parallel_reduction.cc | 14 +- .../graph/series_parallel/series_reduction.cc | 18 +- .../test/src/utils/nonnegative_int/ceildiv.cc | 52 ++ .../utils/nonnegative_int/nonnegative_int.cc | 90 ++- .../nonnegative_int/nonnegative_range.cc | 42 ++ .../src/utils/nonnegative_int/num_elements.cc | 15 + lib/utils/test/src/utils/random_utils.cc | 6 +- 423 files changed, 7336 insertions(+), 5040 deletions(-) create mode 100644 lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml delete mode 100644 lib/local-execution/include/local-execution/legion_tensor_shape.h delete mode 100644 lib/local-execution/src/legion_tensor_shape.cc delete mode 100644 lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml create mode 100644 lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.h (76%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.h (62%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.struct.toml (100%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.h (85%) create mode 100644 lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h create mode 100644 lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h create mode 100644 lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h create mode 100644 lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml create mode 100644 lib/substitutions/include/substitutions/substitution_builder.h create mode 100644 lib/substitutions/include/substitutions/unity_substitution_set.h create mode 100644 lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.cc (96%) rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.cc (93%) rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.cc (95%) create mode 100644 lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc create mode 100644 lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc create mode 100644 lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc create mode 100644 
lib/substitutions/src/substitutions/substitution_builder.cc create mode 100644 lib/substitutions/src/substitutions/unity_substitution_set.cc create mode 100644 lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc rename lib/substitutions/test/src/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.cc (86%) rename lib/substitutions/test/src/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.cc (78%) create mode 100644 lib/substitutions/test/src/substitutions/substitution_builder.cc create mode 100644 lib/substitutions/test/src/substitutions/unity_substitution_set.cc rename lib/substitutions/test/src/{test_pattern_matches.cc => substitutions/unlabelled/find_pattern_matches.cc} (94%) create mode 100644 lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc delete mode 100644 lib/substitutions/test/src/test_substitution.cc create mode 100644 lib/utils/include/utils/containers/make.h create mode 100644 lib/utils/include/utils/containers/merge_method.enum.toml create mode 100644 lib/utils/include/utils/containers/repeat_element.h delete mode 100644 lib/utils/include/utils/containers/replicate.h create mode 100644 lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h create mode 100644 lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h create mode 100644 lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h create mode 100644 lib/utils/include/utils/graph/render_dot.h create mode 100644 lib/utils/include/utils/nonnegative_int/ceildiv.h create mode 100644 lib/utils/include/utils/nonnegative_int/nonnegative_range.h create mode 100644 lib/utils/include/utils/nonnegative_int/num_elements.h create mode 100644 lib/utils/src/utils/containers/make.cc create mode 100644 lib/utils/src/utils/containers/repeat_element.cc delete mode 100644 lib/utils/src/utils/containers/replicate.cc delete mode 100644 lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h create mode 100644 lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc create mode 100644 lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc create mode 100644 lib/utils/src/utils/graph/render_dot.cc create mode 100644 lib/utils/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/src/utils/nonnegative_int/nonnegative_range.cc create mode 100644 lib/utils/src/utils/nonnegative_int/num_elements.cc delete mode 100644 lib/utils/test/src/main.cc create mode 100644 lib/utils/test/src/utils/containers/at_idx.cc create mode 100644 lib/utils/test/src/utils/containers/enumerate_vector.cc create mode 100644 lib/utils/test/src/utils/containers/make.cc rename lib/utils/test/src/utils/containers/{replicate.cc => repeat_element.cc} (69%) create mode 100644 lib/utils/test/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc create mode 100644 lib/utils/test/src/utils/nonnegative_int/num_elements.cc diff --git a/bin/export-model-arch/src/export_model_arch.cc b/bin/export-model-arch/src/export_model_arch.cc index 64419acce4..a9f6c65b86 100644 --- a/bin/export-model-arch/src/export_model_arch.cc +++ b/bin/export-model-arch/src/export_model_arch.cc @@ -13,6 +13,7 @@ #include "utils/cli/cli_parse.h" #include "utils/cli/cli_parse_result.h" #include "utils/cli/cli_spec.h" +#include 
"utils/graph/open_dataflow_graph/algorithms/as_dot.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/right_associative_binary_sp_tree_from_nary.h" #include "utils/graph/series_parallel/get_series_parallel_decomposition.h" @@ -21,11 +22,11 @@ using namespace ::FlexFlow; ComputationGraph get_single_operator_computation_graph() { ComputationGraphBuilder b; - size_t batch_size = 8; - size_t in_channels = 16; - size_t out_channels = 12; + nonnegative_int batch_size = 8_n; + nonnegative_int in_channels = 16_n; + nonnegative_int out_channels = 12_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ batch_size, in_channels, out_channels, @@ -69,7 +70,7 @@ tl::expected } else if (model_name == "bert") { return get_bert_computation_graph(get_default_bert_config()); } else if (model_name == "split_test") { - int batch_size = 8; + nonnegative_int batch_size = 8_n; return get_split_test_computation_graph(batch_size); } else if (model_name == "single_operator") { return get_single_operator_computation_graph(); @@ -100,10 +101,10 @@ tl::expected result.value(); }); - std::pair> v1_result = - to_v1_including_node_numbering(computation_graph); + std::pair> + v1_result = to_v1_including_node_numbering(computation_graph); V1ComputationGraph v1_cg = v1_result.first; - bidict layer_numbering = v1_result.second; + bidict layer_numbering = v1_result.second; V1BinarySPDecomposition v1_sp_decomposition = to_v1(sp_decomposition, layer_numbering); diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 7ba39e92c9..515a249521 100644 --- a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -20,6 +20,7 @@ function(define_ff_vars target) MAX_TENSOR_DIM=${FF_MAX_DIM} MAX_NUM_TASK_REGIONS=${FF_MAX_NUM_TASK_REGIONS} MAX_NUM_TASK_ARGUMENTS=${FF_MAX_NUM_TASK_ARGUMENTS} + # _FORTIFY_SOURCE=0 ) if (FF_GPU_BACKEND STREQUAL "cuda") @@ -39,7 +40,18 @@ function(ff_set_cxx_properties target) CXX_EXTENSIONS NO ) target_compile_options(${target} - PRIVATE $<$:> "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." # add C++ compile flags here + PUBLIC + $<$:> + "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." + "-fsanitize=undefined" + "-fno-sanitize-recover=all" + # add C++ compile flags here + ) + target_link_options(${target} + PUBLIC + $<$:> + "-fsanitize=undefined" + "-fno-sanitize-recover=all" ) endfunction() diff --git a/flake.nix b/flake.nix index 91651bd0c1..e4644ef727 100644 --- a/flake.nix +++ b/flake.nix @@ -38,9 +38,15 @@ }; lib = pkgs.lib; - mkShell = pkgs.mkShell.override { + mkShell = attrs: pkgs.mkShell.override { stdenv = pkgs.cudaPackages.backendStdenv; - }; + } (attrs // { + hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch + # signed overflows due to the signedoverflow hardening setting. 
+ # for more details, see the following (long-running) nixpkgs github issues: + # - https://github.com/NixOS/nixpkgs/issues/18995 + # - https://github.com/NixOS/nixpkgs/issues/60919 + }); proj = proj-repo.packages.${system}.proj; in @@ -121,6 +127,8 @@ gpu-ci = mkShell { inputsFrom = [ ci ]; + hardeningDisable = [ "all" ]; + buildInputs = builtins.concatLists [ (with nixGL.packages.${system}; [ nixGLDefault @@ -135,6 +143,8 @@ "${proj-repo.packages.${system}.proj-nvim}" ]; + hardeningDisable = [ "all" ]; + buildInputs = builtins.concatLists [ (with pkgs; [ clang-tools diff --git a/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h index fdc80a1e37..8a7c467303 100644 --- a/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h +++ b/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h @@ -36,8 +36,9 @@ bool is_right_associative(ComputationGraphBinarySPDecomposition const &); std::unordered_multiset get_layers(ComputationGraphBinarySPDecomposition const &); -V1BinarySPDecomposition to_v1(ComputationGraphBinarySPDecomposition const &, - bidict const &layer_numbering); +V1BinarySPDecomposition + to_v1(ComputationGraphBinarySPDecomposition const &, + bidict const &layer_numbering); } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index db7477b460..6f86d1d82a 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -11,12 +11,15 @@ #include "utils/containers/map_from_keys_and_values.h" #include "utils/containers/product.h" #include "utils/containers/range.h" -#include "utils/containers/replicate.h" +#include "utils/containers/repeat_element.h" #include "utils/containers/sorted.h" #include "utils/containers/transform.h" #include "utils/containers/unordered_multiset_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/zip.h" +#include "utils/nonnegative_int/ceildiv.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" namespace FlexFlow { @@ -47,24 +50,29 @@ static std::unordered_set OperatorTaskSpace const &task, DeviceType const &device_type) { - auto get_max_stride_upper_bound = [](std::vector const &tensor_dims, - int total_devices) -> int { - int min_num_devices_with_full_stride_volume = product(transform( - tensor_dims, [](int const &num_devices) { return num_devices - 1; })); - return std::ceil(total_devices / min_num_devices_with_full_stride_volume); + auto get_max_stride_upper_bound = + [](std::vector const &tensor_dims, + nonnegative_int total_devices) -> nonnegative_int { + nonnegative_int min_num_devices_with_full_stride_volume = + product(transform(tensor_dims, [](nonnegative_int num_devices) { + return nonnegative_int{num_devices.unwrap_nonnegative() - 1}; + })); + return ceildiv(total_devices, min_num_devices_with_full_stride_volume); }; - auto candidate_strides = [&](std::vector const &tensor_dims, - int total_devices) + auto candidate_strides = [&](std::vector const &tensor_dims, + nonnegative_int total_devices) -> std::unordered_multiset { - int max_stride_upper_bound = + nonnegative_int max_stride_upper_bound = 
get_max_stride_upper_bound(tensor_dims, total_devices); std::vector single_stride_range = - transform(range(1, max_stride_upper_bound + 1), - [](int stride) { return stride_t{stride}; }); + transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n), + [](nonnegative_int stride) { return stride_t{stride}; }); std::unordered_multiset> raw_stride_vectors = - cartesian_product(replicate(tensor_dims.size(), single_stride_range)); + cartesian_product( + repeat_element(/*num_times=*/num_elements(tensor_dims), + /*element=*/single_stride_range)); std::unordered_multiset strides = transform(raw_stride_vectors, [](auto const &stride_vec) { return MultiDimensionalStride{stride_vec}; @@ -75,8 +83,9 @@ static std::unordered_set auto candidate_starts = [](MachineSpecification const &ms, DeviceType const &device_type) { std::unordered_set result; - for (int node_idx : range(ms.num_nodes)) { - for (int device_idx : range(get_num_devices_per_node(ms, device_type))) { + for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes)) { + for (nonnegative_int device_idx : + nonnegative_range(get_num_devices_per_node(ms, device_type))) { result.insert( MachineSpaceCoordinate{node_idx, device_idx, device_type}); } @@ -91,8 +100,8 @@ static std::unordered_set return get_all_permutations_with_repetition(options, num_dims(task)); }; - std::vector tensor_dims = task.degrees; - int total_devices = get_num_devices(machine_spec, device_type); + std::vector tensor_dims = task.degrees; + nonnegative_int total_devices = get_num_devices(machine_spec, device_type); std::unordered_set machine_views; diff --git a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc index 5126d9687e..bb9d54f1e9 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -11,8 +11,9 @@ std::unordered_set> for (int i = 1; i < resource.num_nodes; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_nodes = i; - sub_resource2.num_nodes = resource.num_nodes - i; + sub_resource1.num_nodes = nonnegative_int{i}; + sub_resource2.num_nodes = + nonnegative_int{resource.num_nodes.unwrap_nonnegative() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } @@ -20,8 +21,9 @@ std::unordered_set> for (int i = 1; i < resource.num_gpus_per_node; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_gpus_per_node = i; - sub_resource2.num_gpus_per_node = resource.num_gpus_per_node - i; + sub_resource1.num_gpus_per_node = nonnegative_int{i}; + sub_resource2.num_gpus_per_node = + nonnegative_int{resource.num_gpus_per_node.unwrap_nonnegative() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index fc3a58995c..82c8274808 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -1,20 +1,14 @@ #include "compiler/machine_mapping/machine_mapping.h" -#include "pcg/machine_specification.h" -#include "pcg/machine_view.h" -#include 
"pcg/operator_task_space.dtg.h" -#include "pcg/operator_task_space.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "utils/containers/are_disjoint.h" -#include "utils/containers/get_one_of.h" #include "utils/containers/keys.h" -#include "utils/containers/map_values.h" #include "utils/containers/merge_maps.h" namespace FlexFlow { MachineMapping combine_disjoint_mappings(MachineMapping const &m1, MachineMapping const &m2) { - return MachineMapping{merge_maps(m1.machine_views, m2.machine_views)}; + return MachineMapping{ + merge_disjoint_maps(m1.machine_views, m2.machine_views)}; } bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) { diff --git a/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc index 715a4c2e3d..ed60004bf4 100644 --- a/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc @@ -10,8 +10,8 @@ ParallelLayerGuidObliviousMachineMapping binary_combine_mappings( ParallelLayerGuidObliviousMachineMapping const &lhs, ParallelLayerGuidObliviousMachineMapping const &rhs) { return ParallelLayerGuidObliviousMachineMapping{ - merge_maps(map_keys(lhs.raw_mapping, nest_inside_left_child), - map_keys(rhs.raw_mapping, nest_inside_right_child)), + merge_disjoint_maps(map_keys(lhs.raw_mapping, nest_inside_left_child), + map_keys(rhs.raw_mapping, nest_inside_right_child)), }; } diff --git a/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc b/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc index 32fb53b58a..9886468386 100644 --- a/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc +++ b/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc @@ -164,7 +164,7 @@ std::unordered_multiset V1BinarySPDecomposition to_v1(ComputationGraphBinarySPDecomposition const &tree, - bidict const &layer_numbering) { + bidict const &layer_numbering) { return tree.visit( overload{[&](ComputationGraphBinarySeriesSplit const &series) { return V1BinarySPDecomposition{ diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc index 936894ad2d..817cc80700 100644 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ b/lib/compiler/test/src/allowed_machine_views.cc @@ -15,39 +15,39 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("1 degree of parallelism") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; std::unordered_set correct = { MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + 
/*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, }, }; @@ -61,18 +61,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("2 degrees of parallelism") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/3, - /*num_cpus_per_node=*/3, - /*num_gpus_per_node=*/3, + /*num_nodes=*/3_n, + /*num_cpus_per_node=*/3_n, + /*num_gpus_per_node=*/3_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{2, 3}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; - auto make_2d_view = [&](int start_node_idx, - int start_device_idx, - int stride1, - int stride2, + auto make_2d_view = [&](nonnegative_int start_node_idx, + nonnegative_int start_device_idx, + nonnegative_int stride1, + nonnegative_int stride2, MachineSpecificationDimension m1, MachineSpecificationDimension m2) { return MachineView{ @@ -86,13 +86,19 @@ TEST_SUITE(FF_TEST_SUITE) { auto intra = MachineSpecificationDimension::INTRA_NODE; auto inter = MachineSpecificationDimension::INTER_NODE; std::unordered_set correct = { - make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, inter, intra), - make_2d_view(1, 0, /*stride1=*/1, /*stride2=*/1, inter, intra), - make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, inter, intra), - - make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, intra, inter), - make_2d_view(0, 1, /*stride1=*/1, /*stride2=*/1, intra, inter), - make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, intra, inter), + make_2d_view( + 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + make_2d_view( + 1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + make_2d_view( + 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra), + + make_2d_view( + 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + make_2d_view( + 0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + make_2d_view( + 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter), }; std::unordered_set result = diff --git a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc index 5c8ea1c0f1..b0d86124a1 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc @@ -28,12 +28,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT,
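[Note: the `_n` suffixes appearing throughout these hunks come from the user-defined literal for nonnegative_int introduced by this series (lib/utils/include/utils/nonnegative_int/nonnegative_int.h). The following is a minimal illustrative sketch of the invariant the literal enforces, not the actual implementation, which has a much richer interface:

#include <stdexcept>

namespace sketch {

// Simplified stand-in for FlexFlow::nonnegative_int: construction checks the
// invariant once, so every later use can assume value >= 0.
class nonnegative_int {
public:
  explicit nonnegative_int(int value) : value_(value) {
    if (value < 0) {
      throw std::invalid_argument("nonnegative_int requires value >= 0");
    }
  }

  int unwrap_nonnegative() const {
    return this->value_;
  }

private:
  int value_;
};

// The literal that lets 3_n, 8_n, etc. parse as checked nonnegative values.
inline nonnegative_int operator""_n(unsigned long long value) {
  return nonnegative_int{static_cast<int>(value)};
}

} // namespace sketch

int main() {
  using namespace sketch;
  nonnegative_int num_nodes = 3_n;               // ok
  int half = num_nodes.unwrap_nonnegative() - 2; // plain int arithmetic
  nonnegative_int rest = nonnegative_int{half};  // re-checked on rewrap
  // nonnegative_int bad = nonnegative_int{-1};  // would throw
  return rest.unwrap_nonnegative();
}

Under this scheme, an expression like nonnegative_int{resource.num_nodes.unwrap_nonnegative() - i} in get_machine_resource_splits.cc below fails fast if a split ever produced a negative node count.]

diff --git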
a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc index 499b111f8f..5f4ba2bfdc 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -8,10 +8,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_machine_resource_splits") { - auto make_machine_spec = [](int num_nodes, int num_gpus_per_node) { + auto make_machine_spec = [](nonnegative_int num_nodes, + nonnegative_int num_gpus_per_node) { return MachineSpecification{ /*num_nodes=*/num_nodes, - /*num_cpus_per_node=*/1, + /*num_cpus_per_node=*/1_n, /*num_gpus_per_node=*/num_gpus_per_node, /*inter_node_bandwidth=*/1.0, /*intra_node_bandwidth=*/1.0, @@ -19,8 +20,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("returns no splits if no splits are possible") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1); + MachineSpecification input = make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n); std::unordered_set> result = get_machine_resource_splits(input); @@ -32,8 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE( "returns splits in gpu and node dimensions, but not at the same time") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/2); + MachineSpecification input = make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/2_n); std::unordered_set> result = get_machine_resource_splits(input); @@ -41,16 +42,16 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set> correct = { { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, }; @@ -60,8 +61,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in node dimension in powers of two") { SUBCASE("num_nodes is a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/8, - /*num_gpus_per_node=*/1); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/8_n, + /*num_gpus_per_node=*/1_n); std::unordered_set< std::pair> @@ -71,34 +73,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/7, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/7_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/6, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/6, - 
/*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/7, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/7_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -106,8 +108,9 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_nodes is not a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/6, - /*num_gpus_per_node=*/1); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n); std::unordered_set< std::pair> @@ -117,28 +120,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/5, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/5_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/5, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/5_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -148,8 +151,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in gpu dimension in powers of two") { SUBCASE("num_gpus_per_node is a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/8); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/8_n); std::unordered_set< std::pair> @@ -159,34 +163,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/7), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/7_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, { - 
make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/7), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/7_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -194,8 +198,9 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_gpus_per_node is not a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n); std::unordered_set< std::pair> @@ -205,28 +210,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/5), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/5_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/5), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/5_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; } diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 542edd9fa9..c5b891781d 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -45,14 +45,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -60,31 +60,31 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -121,8 +121,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + 
SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index 52ad82595d..642fdf7ae1 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -30,12 +30,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -66,14 +66,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView pre_mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -81,14 +81,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView pre_mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -96,14 +96,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView post_mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{3}, + stride_t{3_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -111,14 +111,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView post_mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc index 304034f9be..e88b714bd4 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc @@ -9,14 +9,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("combine_disjoint_mappings(MachineMapping, MachineMappping)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -24,14 +24,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -55,14 +55,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("nodes_are_disjoint(MachineMapping, MachineMappping)") { MachineView machine_view_0 = 
MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -70,14 +70,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 06ab1e5b8c..a8ec24de63 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -65,11 +65,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 1}, + ShardParallelDim{10_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index 73b921fc98..4a261bcdae 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -8,14 +8,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("series_combine") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -23,14 +23,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -189,14 +189,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("parallel_combine") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -204,14 +204,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -312,14 +312,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("minimize_runtime") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + 
/*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -327,14 +327,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 8612017705..313f24c384 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -45,14 +45,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -60,31 +60,31 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -121,8 +121,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 1f3b7545a8..04149cae8f 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -9,14 +9,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("remove_non_pareto_optimal_machine_mapping_result") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -24,14 +24,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ 
/*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -39,14 +39,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -55,19 +55,19 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics cost1 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; OpCostMetrics cost2 = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; OpCostMetrics cost3 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{3}, + /*memory=*/3_n, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ @@ -159,14 +159,14 @@ TEST_SUITE(FF_TEST_SUITE) { "std::optional const&)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -174,14 +174,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -190,7 +190,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics pre_cost = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -217,7 +217,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics post_cost = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{ @@ -360,14 +360,14 @@ TEST_SUITE(FF_TEST_SUITE) { "std::optional const&)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -375,14 +375,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -391,7 +391,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics lhs_cost = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -418,7 +418,7 @@ 
TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics rhs_cost = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -492,14 +492,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("minimize_runtime(memory)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -507,14 +507,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -522,14 +522,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -538,17 +538,17 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics cost1 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; OpCostMetrics cost2 = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; OpCostMetrics cost3 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{3}, + /*memory=*/3_n, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ diff --git a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc index 2b59669aad..d0f289043c 100644 --- a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc @@ -29,11 +29,12 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; b.create_input(input_shape, CreateGrad::YES, input_layer_name); return b.computation_graph; @@ -57,16 +58,17 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES, input_layer_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -119,9 +121,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 
10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -129,7 +131,7 @@ TEST_SUITE(FF_TEST_SUITE) { b.create_input(input_shape, CreateGrad::YES, input_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -138,7 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*name=*/op1_name, /*projection_name=*/w1_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -189,9 +191,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -246,9 +248,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -277,7 +279,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("real models") { SUBCASE("split_test") { ComputationGraph cg = - get_split_test_computation_graph(/*batch_size=*/8); + get_split_test_computation_graph(/*batch_size=*/8_n); std::optional sp_decomposition = get_computation_graph_series_parallel_decomposition(cg); @@ -339,14 +341,15 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); - b.dense(input, /*outDim=*/14); + b.dense(input, /*outDim=*/14_n); return b.computation_graph; }(); @@ -356,7 +359,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("split_test") { - ComputationGraph cg = get_split_test_computation_graph(/*batch_size=*/8); + ComputationGraph cg = + get_split_test_computation_graph(/*batch_size=*/8_n); std::string result = render_preprocessed_computation_graph_for_sp_decomposition(cg); diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc index e278338440..d262539dc1 100644 --- a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -38,9 +38,9 @@ namespace FlexFlow { TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("task_simulator_estimate_forward_pass_time") { MachineSpecification machine_spec = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/3, - /*num_gpus_per_node=*/3, + MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/3_n, + /*num_gpus_per_node=*/3_n, /*inter_node_bandwidth=*/1.0f, /*intra_node_bandwidth=*/1.0f}; @@ -50,8 +50,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -63,16 +63,16 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t layer1 = get_source_layer(tensor1); std::vector dims = { - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, }; ParallelComputationGraph pcg = b.pcg; MachineView mv1 = - MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + 
MachineView{MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; MachineView mv2 = - MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; + MachineView{MachineSpaceCoordinate{0_n, 1_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv1}, @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); @@ -99,16 +99,16 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/1.0f, /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layer1 + /*memory=*/0_n}; // layer1 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); @@ -124,10 +124,10 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ - FFOrdered{ShardParallelDim{10, 1}}, + FFOrdered{ShardParallelDim{10_n, 1_n}}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -145,23 +145,23 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = b.pcg; std::vector dims = { - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, }; SUBCASE("all different devices") { - MachineView mv0 = - MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; - MachineView mv1 = - MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; - MachineView mv2 = - MachineView{MachineSpaceCoordinate{1, 0, DeviceType::GPU}, dims}; - MachineView mv3 = - MachineView{MachineSpaceCoordinate{1, 1, DeviceType::GPU}, dims}; + MachineView mv0 = MachineView{ + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; + MachineView mv1 = MachineView{ + MachineSpaceCoordinate{0_n, 1_n, DeviceType::GPU}, dims}; + MachineView mv2 = MachineView{ + MachineSpaceCoordinate{1_n, 0_n, DeviceType::GPU}, dims}; + MachineView mv3 = MachineView{ + MachineSpaceCoordinate{1_n, 1_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv0}, @@ -174,7 +174,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); @@ -187,30 +187,29 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { - return OpCostMetrics{ - /*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layers 1, 2 + return OpCostMetrics{/*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/0_n}; 
// layers 1, 2 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/2.0f, /*backward_runtime=*/2.0f, - /*memory=*/nonnegative_int{0}}; // layer3 + /*memory=*/0_n}; // layer3 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); } } SUBCASE("all the same device") { - MachineView mv = - MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineView mv = MachineView{ + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv}, {layer1, mv}, @@ -222,7 +221,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); @@ -235,22 +234,21 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { - return OpCostMetrics{ - /*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layers 1, 2 + return OpCostMetrics{/*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/0_n}; // layers 1, 2 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/2.0f, /*backward_runtime=*/2.0f, - /*memory=*/nonnegative_int{0}}; // layer3 + /*memory=*/0_n}; // layer3 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); float result = task_simulator_estimate_forward_pass_time( diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc index 46177ad420..0fd9e245a6 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/graph_optimize_state.cc @@ -11,35 +11,37 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ParallelTensorDims{ FFOrdered{ - ShardParallelDim{32, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{32_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT}; parallel_tensor_guid_t input0 = builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); - parallel_tensor_guid_t dense0 = builder.dense(input0, - 8, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense0"); + parallel_tensor_guid_t dense0 = + builder.dense(/*input=*/input0, + /*outDim=*/8_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense0"); - parallel_tensor_guid_t dense1 = builder.dense(dense0, - 4, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense1"); + parallel_tensor_guid_t dense1 = + builder.dense(/*input=*/dense0, + /*outDim=*/4_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense1"); ParallelComputationGraph pcg = builder.pcg; @@ -59,14 +61,15 @@ TEST_SUITE(FF_TEST_SUITE) 
{ parallel_tensor_guid_t input0_ = builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); - parallel_tensor_guid_t dense0_ = builder.dense(input0, - 8, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense0"); + parallel_tensor_guid_t dense0_ = + builder.dense(/*input=*/input0, + /*outDim=*/8_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense0"); ParallelComputationGraph pcg_ = builder.pcg; diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 326c6922f9..57498ee466 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,8 +1,9 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define _FLEXFLOW_KERNELS_ARRAY_SHAPE_H -#include "legion_dim.h" +#include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include "utils/stack_vector/stack_vector.h" #include "utils/visitable.h" #include @@ -14,44 +15,49 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(size_t *dims, size_t num_dims); + ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + ArrayShape(std::vector const &); /** * @brief Alias of ArrayShape::num_elements for compatibility with * Legion::Domain */ - std::size_t get_volume() const; + nonnegative_int get_volume() const; /** * @brief Alias of ArrayShape::num_dims for compatibility with Legion::Domain */ - std::size_t get_dim() const; + nonnegative_int get_dim() const; - std::size_t num_elements() const; - std::size_t num_dims() const; + nonnegative_int num_elements() const; + nonnegative_int num_dims() const; - std::size_t operator[](legion_dim_t) const; - std::size_t at(legion_dim_t) const; - std::size_t at(ff_dim_t) const; + nonnegative_int operator[](legion_dim_t) const; + nonnegative_int at(legion_dim_t) const; + nonnegative_int at(ff_dim_t) const; + + bool operator==(ArrayShape const &) const; + bool operator!=(ArrayShape const &) const; legion_dim_t last_idx() const; legion_dim_t neg_idx(int) const; - std::optional at_maybe(legion_dim_t) const; - std::optional at_maybe(ff_dim_t) const; + std::optional at_maybe(legion_dim_t) const; + std::optional at_maybe(ff_dim_t) const; ArrayShape sub_shape(std::optional> start, std::optional> end) const; public: - LegionTensorDims dims; + LegionOrdered dims; + +private: + std::tuple tie() const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ArrayShape, dims); -size_t get_volume(ArrayShape const &); +nonnegative_int get_volume(ArrayShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 3fea92c86b..4b89eb1411 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -3,46 +3,11 @@ #include "device.h" #include "kernels/allocation.h" +#include "kernels/batch_norm_per_device_state.dtg.h" #include "kernels/ff_handle.h" #include namespace FlexFlow { - -struct BatchNormPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t outputTensor; - ffTensorDescriptor_t biasTensor; - ffActivationDescriptor_t actiDesc; - ffBatchNormMode_t mode; - float 
*runningMean;
-  float *runningVar;
-  float *saveMean;
-  float *saveVar;
-  int output_n;
-  int output_c;
-  int output_h;
-  int output_w;
-  req<bool> relu;
-};
-
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState,
-                                             handle,
-                                             inputTensor,
-                                             outputTensor,
-                                             biasTensor,
-                                             actiDesc,
-                                             mode,
-                                             runningMean,
-                                             runningVar,
-                                             saveMean,
-                                             saveVar,
-                                             output_n,
-                                             output_c,
-                                             output_h,
-                                             output_w,
-                                             relu);
-
 namespace Kernels::BatchNorm {
 
 BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle,
@@ -55,14 +20,14 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                     bool relu);
 
 void forward_kernel(ffStream_t stream,
-                    BatchNormPerDeviceState const &m,
+                    BatchNormPerDeviceState const &per_device_state,
                     float const *input_ptr,
                     float *output_ptr,
                     float const *scale_ptr,
                     float const *bias_ptr);
 
 void backward_kernel(ffStream_t stream,
-                     BatchNormPerDeviceState const &m,
+                     BatchNormPerDeviceState const &per_device_state,
                      float const *output_ptr,
                      float *output_grad_ptr,
                      float const *input_ptr,
diff --git a/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml b/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml
new file mode 100644
index 0000000000..6d2f04f60c
--- /dev/null
+++ b/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml
@@ -0,0 +1,68 @@
+namespace = "FlexFlow"
+name = "BatchNormPerDeviceState"
+features = []
+
+includes = [
+  "kernels/device.h",
+  "kernels/ff_handle.h",
+]
+
+[[fields]]
+name = "handle"
+type = "::FlexFlow::PerDeviceFFHandle"
+
+[[fields]]
+name = "inputTensor"
+type = "ffTensorDescriptor_t"
+
+[[fields]]
+name = "outputTensor"
+type = "ffTensorDescriptor_t"
+
+[[fields]]
+name = "biasTensor"
+type = "ffTensorDescriptor_t"
+
+[[fields]]
+name = "actiDesc"
+type = "ffActivationDescriptor_t"
+
+[[fields]]
+name = "mode"
+type = "ffBatchNormMode_t"
+
+[[fields]]
+name = "runningMean"
+type = "float *"
+
+[[fields]]
+name = "runningVar"
+type = "float *"
+
+[[fields]]
+name = "saveMean"
+type = "float *"
+
+[[fields]]
+name = "saveVar"
+type = "float *"
+
+[[fields]]
+name = "output_n"
+type = "int"
+
+[[fields]]
+name = "output_c"
+type = "int"
+
+[[fields]]
+name = "output_h"
+type = "int"
+
+[[fields]]
+name = "output_w"
+type = "int"
+
+[[fields]]
+name = "relu"
+type = "bool"
diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h
index e4dd9723b8..7b9b9c455c 100644
--- a/lib/kernels/include/kernels/legion_dim.h
+++ b/lib/kernels/include/kernels/legion_dim.h
@@ -8,19 +8,23 @@ namespace FlexFlow {
 
 legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);
 
-legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions);
+legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions);
 
 template <typename T>
 using LegionOrdered = DimOrdered<legion_dim_t, T>;
 
-using LegionTensorDims = LegionOrdered<size_t>;
-
 template <typename T>
 FFOrdered<T>
     ff_ordered_from_legion_ordered(LegionOrdered<T> const &legion_ordered) {
   return FFOrdered<T>(legion_ordered.rbegin(), legion_ordered.rend());
 }
 
+template <typename T>
+LegionOrdered<T>
+    legion_ordered_from_ff_ordered(FFOrdered<T> const &ff_ordered) {
+  return LegionOrdered<T>(ff_ordered.rbegin(), ff_ordered.rend());
+}
+
 template <typename T>
 std::string format_as(LegionOrdered<T> const &v) {
   std::vector<T> as_vec(v.cbegin(), v.cend());
diff --git a/lib/kernels/include/kernels/legion_dim_t.struct.toml b/lib/kernels/include/kernels/legion_dim_t.struct.toml
index d2afb0d73f..6c047f096b 100644
--- a/lib/kernels/include/kernels/legion_dim_t.struct.toml
+++
b/lib/kernels/include/kernels/legion_dim_t.struct.toml @@ -1,6 +1,5 @@ namespace = "FlexFlow" name = "legion_dim_t" - features = [ "eq", "ord", @@ -9,6 +8,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "value" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml index f99ff10bb9..0171e3e497 100644 --- a/lib/kernels/include/kernels/per_device_op_state.variant.toml +++ b/lib/kernels/include/kernels/per_device_op_state.variant.toml @@ -19,7 +19,6 @@ includes = [ "kernels/reshape_kernels.h", "kernels/softmax_kernels.h", "kernels/topk_kernels.h", - "kernels/transpose_kernels.h", ] [[values]] @@ -81,7 +80,3 @@ key = "softmax_per_device_state" [[values]] type = "::FlexFlow::TopKPerDeviceState" key = "topk_per_device_state" - -[[values]] -type = "::FlexFlow::TransposePerDeviceState" -key = "transpose_per_device_state" diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index dbf78826cb..0ed10ac03d 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -3,31 +3,20 @@ #include "device.h" #include "kernels/accessor.h" +#include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -struct TransposePerDeviceState { - int num_dim; - req> perm; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TransposePerDeviceState, - num_dim, - perm); - namespace Kernels::Transpose { -TransposePerDeviceState init_kernel(int num_dim, - std::vector const &perm); - void forward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &attrs, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &attrs, GenericTensorAccessorR const &out_grad, GenericTensorAccessorW const &in_grad); diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 733146851a..bed8daba51 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -17,7 +17,8 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); + void *ptr = + this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); return {tensor_shape.data_type, tensor_shape, ptr, diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 5c18a9ab5a..54534f2ccf 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -1,55 +1,64 @@ #include "kernels/array_shape.h" #include "utils/containers/product.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionTensorDims - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - std::vector sizes(ff_ordered.size()); - std::reverse_copy(ff_ordered.begin(), ff_ordered.end(), sizes.begin()); - return LegionTensorDims(sizes.begin(), sizes.end()); +static LegionOrdered + legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { + return LegionOrdered{reversed(vector_of(ff_ordered))}; } -ArrayShape::ArrayShape(size_t *_dims, size_t num_dims) - : dims(_dims, _dims + num_dims) {} 
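// Annotation (illustrative sketch, not part of the patch): the dimension
// ordering this file converts between, assuming the helpers declared in
// kernels/legion_dim.h as modified by this patch. Legion orders dimensions
// innermost-first and FlexFlow outermost-first, so each conversion is a
// reversal, and the round trip is the identity.
static void legion_ff_ordering_example() {
  FFOrdered<nonnegative_int> ff_dims = {32_n, 16_n, 8_n};
  // Reversal: legion_dims holds {8_n, 16_n, 32_n}.
  LegionOrdered<nonnegative_int> legion_dims =
      legion_ordered_from_ff_ordered(ff_dims);
  assert(ff_ordered_from_legion_ordered(legion_dims) == ff_dims);
}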
+ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims)
+    : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {}
 
 ArrayShape::ArrayShape(TensorShape const &shape)
     : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {}
 
-ArrayShape::ArrayShape(std::vector<std::size_t> const &input_dims)
+ArrayShape::ArrayShape(std::vector<nonnegative_int> const &input_dims)
     : dims(input_dims) {}
 
-std::size_t ArrayShape::get_volume() const {
+nonnegative_int ArrayShape::get_volume() const {
   return this->num_elements();
 }
 
-std::size_t ArrayShape::num_dims() const {
-  return this->dims.size();
+nonnegative_int ArrayShape::num_dims() const {
+  return ::FlexFlow::num_elements(this->dims);
 }
 
-std::size_t ArrayShape::get_dim() const {
+nonnegative_int ArrayShape::get_dim() const {
   return this->num_dims();
 }
 
-std::size_t ArrayShape::num_elements() const {
+nonnegative_int ArrayShape::num_elements() const {
   if (dims.size() == 0) {
-    return 0;
+    return 0_n;
   }
   return product(this->dims);
 }
 
-std::size_t ArrayShape::operator[](legion_dim_t idx) const {
+nonnegative_int ArrayShape::operator[](legion_dim_t idx) const {
   return dims.at(idx);
 }
 
-std::size_t ArrayShape::at(legion_dim_t idx) const {
+nonnegative_int ArrayShape::at(legion_dim_t idx) const {
   return dims.at(idx);
 }
 
-std::size_t ArrayShape::at(ff_dim_t idx) const {
+nonnegative_int ArrayShape::at(ff_dim_t idx) const {
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
+bool ArrayShape::operator==(ArrayShape const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool ArrayShape::operator!=(ArrayShape const &other) const {
+  return this->tie() != other.tie();
+}
+
 ArrayShape ArrayShape::sub_shape(
     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
@@ -57,7 +66,7 @@ ArrayShape ArrayShape::sub_shape(
   NOT_IMPLEMENTED();
 }
 
-std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
+std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
   if (index.value < dims.size()) {
     return dims.at(index);
   } else {
@@ -65,11 +74,15 @@ std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
   }
 }
 
-std::optional<std::size_t> ArrayShape::at_maybe(ff_dim_t index) const {
+std::optional<nonnegative_int> ArrayShape::at_maybe(ff_dim_t index) const {
   return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims()));
 }
 
-size_t get_volume(ArrayShape const &shape) {
+std::tuple<LegionOrdered<nonnegative_int> const &> ArrayShape::tie() const {
+  return std::tie(this->dims);
+}
+
+nonnegative_int get_volume(ArrayShape const &shape) {
   return shape.get_volume();
 }
 
diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu
index b30cf6a663..4669955019 100644
--- a/lib/kernels/src/cuda/cuda_helper.cu
+++ b/lib/kernels/src/cuda/cuda_helper.cu
@@ -224,10 +224,10 @@ ffStatus_t
       tensor,
       CUDNN_TENSOR_NCHW,
       CUDNN_DATA_FLOAT,
-      shape.at_maybe(legion_dim_t{0}).value_or(1),
-      shape.at_maybe(legion_dim_t{1}).value_or(1),
-      shape.at_maybe(legion_dim_t{2}).value_or(1),
-      shape.at_maybe(legion_dim_t{3}).value_or(1));
+      shape.at_maybe(legion_dim_t{0_n}).value_or(1_n).unwrap_nonnegative(),
+      shape.at_maybe(legion_dim_t{1_n}).value_or(1_n).unwrap_nonnegative(),
+      shape.at_maybe(legion_dim_t{2_n}).value_or(1_n).unwrap_nonnegative(),
+      shape.at_maybe(legion_dim_t{3_n}).value_or(1_n).unwrap_nonnegative());
 }
 
 cudnnDataType_t ff_to_cudnn_datatype(DataType type) {
diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu
index 512981e32b..98c528cd7b 100644
--- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu
+++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu
@@ -145,21
+145,23 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } - BatchNormPerDeviceState per_device_state = {handle, - inputTensor, - outputTensor, - biasTensor, - actiDesc, - mode, - runningMean, - runningVar, - saveMean, - saveVar, - output_n, - output_c, - output_h, - output_w, - relu}; + BatchNormPerDeviceState per_device_state = BatchNormPerDeviceState{ + handle, + inputTensor, + outputTensor, + biasTensor, + actiDesc, + mode, + runningMean, + runningVar, + saveMean, + saveVar, + output_n, + output_c, + output_h, + output_w, + relu, + }; checkCUDA(cudaStreamDestroy(stream)); return per_device_state; diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index afc3e1f7ef..230ca70627 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -41,7 +41,7 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + size_t volume = input.shape.get_volume().unwrap_nonnegative(); cast_forward<<>>( input.get(), output.get(), volume); } @@ -52,7 +52,7 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.get_volume(); + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( output.get(), input.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 98c01d1f7c..7cc67ceed8 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -29,7 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get
<DT>(),
                               input.get<DT>(),
-                              input.shape.get_volume() * size_of_datatype(DT),
+                              input.shape.get_volume().unwrap_nonnegative() *
+                                  size_of_datatype(DT).unwrap_nonnegative(),
                               cudaMemcpyDeviceToDevice,
                               stream));
   }
 
@@ -40,7 +41,7 @@ struct BackwardKernel {
   void operator()(ffStream_t stream,
                   GenericTensorAccessorR const &output_grad,
                   GenericTensorAccessorW const &input_grad) {
-    size_t num_elements = output_grad.shape.get_volume();
+    size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative();
     add_kernel<real_type_t<DT>>
         <<<GET_BLOCKS(num_elements), CUDA_NUM_THREADS, 0, stream>>>(
             input_grad.get<DT>(), output_grad.get<DT>
(), num_elements); diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index ad216feda2..aa442f5c3d 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -23,8 +23,11 @@ void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0}, axis).num_elements(); - num_blocks = shape.sub_shape(axis, std::nullopt).num_elements(); + blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + .num_elements() + .unwrap_nonnegative(); + num_blocks = + shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index 0a4024ba8a..32e749e15a 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -137,15 +137,15 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; - int input_w = input.shape[legion_dim_t(0)]; - int input_h = input.shape[legion_dim_t(1)]; - int input_c = input.shape[legion_dim_t(2)]; - int input_n = input.shape[legion_dim_t(3)]; + int input_w = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); + int input_h = input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); + int input_c = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); + int input_n = input.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); - int output_w = output.shape[legion_dim_t(0)]; - int output_h = output.shape[legion_dim_t(1)]; - int output_c = output.shape[legion_dim_t(2)]; - int output_n = output.shape[legion_dim_t(3)]; + int output_w = output.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); + int output_h = output.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); + int output_c = output.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); + int output_n = output.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 687a9fa220..e096803682 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -266,7 +266,7 @@ struct ForwardKernel { output.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_scalar_unary_forward_kernel> <<>>( num_elements, @@ -275,7 +275,7 @@ struct ForwardKernel { input.get(), output.get()); } else { - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_unary_forward_kernel> <<>>( num_elements, op_type, input.get(), output.get()); @@ -312,7 +312,7 @@ struct BackwardKernel { input_grad.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_scalar_unary_backward_kernel> <<>>( num_elements, @@ -323,7 +323,7 @@ struct BackwardKernel { input.get(), input_grad.get()); } else { - size_t num_elements = input.shape.num_elements(); + size_t num_elements = 
input.shape.num_elements().unwrap_nonnegative(); elewise_unary_backward_kernel> <<>>( num_elements, diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index f661e5fb0a..14bb3bddd1 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -27,7 +27,8 @@ void forward_kernel(cudaStream_t stream, checkCUDA(cudaMemcpyAsync(output_ptr, input.get_float_ptr(), - (input.shape.num_elements()) * sizeof(float), + input.shape.num_elements().unwrap_nonnegative() * + sizeof(float), cudaMemcpyDeviceToDevice, stream)); } @@ -39,8 +40,13 @@ void backward_kernel(cudaStream_t stream, float alpha = 1.0f; apply_add_with_scale - <<>>( - input_grad_ptr, output_grad_ptr, input.shape.num_elements(), alpha); + <<>>(input_grad_ptr, + output_grad_ptr, + input.shape.num_elements().unwrap_nonnegative(), + alpha); } } // namespace Flat diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 11c0a1a5e7..31c1bac217 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -128,22 +128,24 @@ void forward_kernel(ffStream_t stream, coord_t stride = output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .num_elements(); - coord_t output_dim_size = output.shape[m.legion_dim]; - coord_t input_dim_size = input.shape[m.legion_dim]; + .num_elements() + .unwrap_nonnegative(); + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); + coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); - DataTypeDispatch1{}(index.data_type, - stream, - input, - index, - output, - output.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + DataTypeDispatch1{}( + index.data_type, + stream, + input, + index, + output, + output.shape.get_volume().unwrap_nonnegative(), + stride, + input_dim_size, + output_dim_size); } void backward_kernel(ffStream_t stream, @@ -156,22 +158,26 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume(); - coord_t output_dim_size = output_grad.shape[m.legion_dim]; - coord_t input_dim_size = input_grad.shape[m.legion_dim]; + .get_volume() + .unwrap_nonnegative(); + coord_t output_dim_size = + output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); + coord_t input_dim_size = + input_grad.shape.at(m.legion_dim).unwrap_nonnegative(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); - DataTypeDispatch1{}(index.data_type, - stream, - output_grad, - index, - input_grad, - output_grad.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + DataTypeDispatch1{}( + index.data_type, + stream, + output_grad, + index, + input_grad, + output_grad.shape.get_volume().unwrap_nonnegative(), + stride, + input_dim_size, + output_dim_size); } } // namespace Gather diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 3687c1cedf..67d5c25c3b 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -29,7 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + 
size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -41,12 +42,13 @@ struct BackwardKernel { RepartitionPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - add_kernel><<>>(input_grad.get(), - output_grad.get(), - input_grad.shape.num_elements()); + add_kernel> + <<>>(input_grad.get(), + output_grad.get(), + input_grad.shape.num_elements().unwrap_nonnegative()); } }; diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 9c3e8dcc40..0ef7e304cf 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -41,12 +41,13 @@ struct ForwardKernel { GenericTensorAccessorW const &output, size_t num_replicas) { - size_t total_elements = input.shape.num_elements() * num_replicas; + size_t total_elements = + input.shape.num_elements().unwrap_nonnegative() * num_replicas; reduction_forward_kernel> <<>>( input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), num_replicas); } }; @@ -58,7 +59,8 @@ struct BackwardKernel { GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 1aa61375f0..b4fa5edb89 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -41,7 +41,8 @@ struct ForwardKernel { checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -53,12 +54,13 @@ struct BackwardKernel { GenericTensorAccessorR const &output, GenericTensorAccessorW const &input, size_t num_replicas) { - size_t total_elements = input.shape.num_elements() * num_replicas; + size_t total_elements = + input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), num_replicas); } }; diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index b7a328ca08..374dfb22ba 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -33,7 +33,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -46,12 +47,12 @@ struct BackwardKernel { GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> - <<>>(input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), static_cast>(alpha)); } }; diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 37e1a08326..e1aaacc7f9 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -16,7 +16,9 @@ 
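// Annotation (illustrative sketch, not part of the patch): the memcpy-style
// kernels above (combine, partition, reduction, replicate, reshape) all
// compute the transfer size the same way now that num_elements() and
// size_of_datatype() return nonnegative_int. A minimal sketch of the shared
// pattern, assuming only the accessor and datatype APIs shown in this diff;
// the helper name accessor_size_in_bytes is hypothetical.
static size_t accessor_size_in_bytes(GenericTensorAccessorR const &accessor) {
  // Bytes to copy: element count times element width, each unwrapped from
  // nonnegative_int exactly once, at the point of use.
  return accessor.shape.num_elements().unwrap_nonnegative() *
         size_of_datatype(accessor.data_type).unwrap_nonnegative();
}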
#include "device.h" #include "kernels/accessor.h" #include "kernels/transpose_kernels.h" +#include "op-attrs/dim_ordered/transform.h" #include "utils/exception.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -29,19 +31,6 @@ struct TransposeStrides { namespace Kernels { namespace Transpose { -TransposePerDeviceState init_kernel(int num_dim, - std::vector const &perm) { - int const length = perm.size(); - - std::vector perm_vector; - assert(length <= MAX_TENSOR_DIM); - for (int i = 0; i < length; ++i) { - perm_vector.push_back(legion_dim_from_ff_dim(perm[i], num_dim)); - } - - return {num_dim, perm_vector}; -} - __global__ void transpose_simple_kernel(std::size_t volume, float const *in_ptr, float *out_ptr, @@ -59,64 +48,92 @@ __global__ void transpose_simple_kernel(std::size_t volume, } } +static LegionOrdered + legion_ordered_perm_from_ff_ordered(FFOrdered const &perm) { + nonnegative_int perm_size = num_elements(perm); + LegionOrdered legion_ordered_perm = + transform(legion_ordered_from_ff_ordered(perm), [&](ff_dim_t d) { + return legion_dim_from_ff_dim(d, perm_size); + }); + + return legion_ordered_perm; +} + void forward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { TransposeStrides info; - info.num_dim = input.shape.num_dims(); - assert(info.num_dim == m.num_dim); + info.num_dim = input.shape.num_dims().unwrap_nonnegative(); + assert(info.num_dim == m.perm.size()); + + LegionOrdered legion_ordered_perm = + legion_ordered_perm_from_ff_ordered(m.perm); + for (int i = 0; i < info.num_dim; i++) { if (i == 0) { info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = input.shape[legion_dim_t(i)] + 1; - int out_dim_size = output.shape[legion_dim_t(i)] + 1; + int in_dim_size = + input.shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative(); + int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } - info.perm[i] = m.perm[i].value; + + info.perm[i] = legion_ordered_perm.at(legion_dim_t{nonnegative_int{i}}) + .value.unwrap_nonnegative(); } - transpose_simple_kernel<<>>(output.shape.get_volume(), - input.get_float_ptr(), - output.get_float_ptr(), - info, - 0.0f /*beta*/); + transpose_simple_kernel<<< + GET_BLOCKS(output.shape.get_volume().unwrap_nonnegative()), + CUDA_NUM_THREADS, + 0, + stream>>>(output.shape.get_volume().unwrap_nonnegative(), + input.get_float_ptr(), + output.get_float_ptr(), + info, + 0.0f /*beta*/); } void backward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &m, GenericTensorAccessorR const &out_grad, GenericTensorAccessorW const &in_grad) { TransposeStrides info; - info.num_dim = in_grad.shape.num_dims(); - assert(info.num_dim == m.num_dim); + info.num_dim = in_grad.shape.num_dims().unwrap_nonnegative(); + assert(info.num_dim == m.perm.size()); + + LegionOrdered legion_ordered_perm = + legion_ordered_perm_from_ff_ordered(m.perm); + for (int i = 0; i < info.num_dim; i++) { if (i == 0) { info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = out_grad.shape[legion_dim_t(i)] + 1; - int out_dim_size = in_grad.shape[legion_dim_t(i)] + 1; + int in_dim_size = out_grad.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); + int out_dim_size = 
in_grad.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } - info.perm[m.perm[i].value] = i; + info.perm[legion_ordered_perm.at(legion_dim_t{nonnegative_int{i}}) + .value.unwrap_nonnegative()] = i; } - transpose_simple_kernel<<>>(in_grad.shape.get_volume(), - out_grad.get_float_ptr(), - in_grad.get_float_ptr(), - info, - 1.0f /*beta*/); + transpose_simple_kernel<<< + GET_BLOCKS(in_grad.shape.get_volume().unwrap_nonnegative()), + CUDA_NUM_THREADS, + 0, + stream>>>(in_grad.shape.get_volume().unwrap_nonnegative(), + out_grad.get_float_ptr(), + in_grad.get_float_ptr(), + info, + 1.0f /*beta*/); } } // namespace Transpose diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 142dcbcb2c..bbb15c5636 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -3,11 +3,14 @@ namespace FlexFlow { legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { - return legion_dim_t(legion_dim.value + value); + return legion_dim_t{ + nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}}; } -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { - return legion_dim_t(num_dimensions - ff_dim.value.get_value() - 1); +legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, + nonnegative_int num_dimensions) { + return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - + ff_dim.value.unwrap_nonnegative() - 1}}; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 023233ecb0..bd0167a677 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -6,11 +6,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { - size_t num_samples = 10; - size_t num_heads = 4; - size_t qSize = 64, kSize = 64, vSize = 64; - size_t qProjSize = 64, kProjSize = 64, vProjSize = 64, oProjSize = 64; - size_t qoSeqLength = 20, kvSeqLength = 20; + nonnegative_int num_samples = 10_n; + nonnegative_int num_heads = 4_n; + nonnegative_int qSize = 64_n; + nonnegative_int kSize = 64_n; + nonnegative_int vSize = 64_n; + nonnegative_int qProjSize = 64_n; + nonnegative_int kProjSize = 64_n; + nonnegative_int vProjSize = 64_n; + nonnegative_int oProjSize = 64_n; + nonnegative_int qoSeqLength = 20_n; + nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{ @@ -19,21 +25,21 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - MHAPerDeviceState state = - Kernels::MultiHeadAttention::init_kernel(managed_handle.raw_handle(), - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - false); + MHAPerDeviceState state = Kernels::MultiHeadAttention::init_kernel( + managed_handle.raw_handle(), + allocator, + /*num_samples=*/num_samples.unwrap_nonnegative(), + /*num_heads=*/num_heads.unwrap_nonnegative(), + /*qSize=*/qSize.unwrap_nonnegative(), + /*kSize=*/kSize.unwrap_nonnegative(), + /*vSize=*/vSize.unwrap_nonnegative(), + /*qProjSize=*/qProjSize.unwrap_nonnegative(), + /*kProjSize=*/kProjSize.unwrap_nonnegative(), + /*vProjSize=*/vProjSize.unwrap_nonnegative(), + /*oProjSize=*/oProjSize.unwrap_nonnegative(), + 
/*qoSeqLength=*/qoSeqLength.unwrap_nonnegative(), + /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), + /*add_bias_kv=*/false); TensorShape query_shape = make_tensor_shape_from_legion_dims( {qoSeqLength, num_samples, qSize}, DataType::FLOAT); @@ -43,8 +49,8 @@ TEST_SUITE(FF_TEST_SUITE) { {kvSeqLength, num_samples, vSize}, DataType::FLOAT); TensorShape output_shape = make_tensor_shape_from_legion_dims( {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); - TensorShape weight_shape = - make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); + TensorShape weight_shape = make_tensor_shape_from_legion_dims( + {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 8a11a069f5..d78d5daee5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -6,13 +6,13 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { - size_t m = 10; - size_t n = 10; - size_t k = 10; - size_t batch = 5; - size_t a_seq_length_dim = -1; - size_t b_seq_length_dim = -1; - size_t seq_length = -1; + nonnegative_int m = 10_n; + nonnegative_int n = 10_n; + nonnegative_int k = 10_n; + nonnegative_int batch = 5_n; + int a_seq_length_dim = -1; + int b_seq_length_dim = -1; + int seq_length = -1; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{ @@ -41,10 +41,10 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr(), a_accessor.get_float_ptr(), b_accessor.get_float_ptr(), - m, - n, - k, - batch, + m.unwrap_nonnegative(), + n.unwrap_nonnegative(), + k.unwrap_nonnegative(), + batch.unwrap_nonnegative(), a_seq_length_dim, b_seq_length_dim, seq_length); @@ -66,10 +66,10 @@ TEST_SUITE(FF_TEST_SUITE) { a_grad_accessor.get_float_ptr(), b_accessor.get_float_ptr(), b_grad_accessor.get_float_ptr(), - m, - n, - k, - batch); + m.unwrap_nonnegative(), + n.unwrap_nonnegative(), + k.unwrap_nonnegative(), + batch.unwrap_nonnegative()); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 270fad7bb6..d0ec2559ba 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -7,7 +7,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { - size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; + nonnegative_int output_n = 1_n; + nonnegative_int output_c = 10_n; + nonnegative_int output_h = 10_n; + nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{ @@ -16,15 +19,15 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - BatchNormPerDeviceState state = - Kernels::BatchNorm::init_kernel(managed_handle.raw_handle(), - allocator, - nullptr, - output_n, - output_c, - output_h, - output_w, - true); + BatchNormPerDeviceState state = Kernels::BatchNorm::init_kernel( + /*handle=*/managed_handle.raw_handle(), + /*allocator=*/allocator, + /*runningMean=*/nullptr, + /*output_n=*/output_n.unwrap_nonnegative(), + /*output_c=*/output_c.unwrap_nonnegative(), + /*output_h=*/output_h.unwrap_nonnegative(), + /*output_w=*/output_w.unwrap_nonnegative(), + /*relu=*/true); TensorShape input_shape = make_tensor_shape_from_legion_dims( 
{output_n, output_c, output_h, output_w}, DataType::FLOAT); @@ -46,12 +49,13 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW bias_accessor = create_filled_accessor_w( bias_shape, allocator, make_float_data_type_value(0)); - Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - scale_accessor.get_float_ptr(), - bias_accessor.get_float_ptr()); + Kernels::BatchNorm::forward_kernel( + /*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input_ptr=*/input_accessor.get_float_ptr(), + /*output_ptr=*/output_accessor.get_float_ptr(), + /*scale_ptr=*/scale_accessor.get_float_ptr(), + /*bias_ptr=*/bias_accessor.get_float_ptr()); CHECK(contains_non_zero(output_accessor)); } @@ -66,16 +70,18 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW bias_grad_accessor = create_random_filled_accessor_w(bias_shape, allocator); - Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), - state, - output_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr(), - input_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr(), - scale_accessor.get_float_ptr(), - scale_grad_accessor.get_float_ptr(), - bias_grad_accessor.get_float_ptr(), - input_accessor.shape.num_elements()); + Kernels::BatchNorm::backward_kernel( + /*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + /*input_ptr=*/input_accessor.get_float_ptr(), + /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), + /*scale_ptr=*/scale_accessor.get_float_ptr(), + /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), + /*bias_grad_ptr=*/bias_grad_accessor.get_float_ptr(), + /*numElements=*/ + input_accessor.shape.num_elements().unwrap_nonnegative()); CHECK(contains_non_zero(input_grad_accessor)); CHECK(contains_non_zero(scale_grad_accessor)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 1be5839a9c..2ac27a9747 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -11,9 +11,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::DOUBLE); + make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index a4688a1030..91f42669eb 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 5447b12fc5..3587cecedd 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -6,9 +6,9 @@ using namespace 
::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - size_t num_inputs = 2; - size_t size_per_input = 10; - ff_dim_t concat_axis = ff_dim_t{nonnegative_int{1}}; + nonnegative_int num_inputs = 2_n; + nonnegative_int size_per_input = 10_n; + ff_dim_t concat_axis = ff_dim_t{1_n}; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -24,7 +24,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { std::vector input_accessors = - repeat(num_inputs, [&]() { + repeat(num_inputs, [&]() { return read_only_accessor_from_write_accessor( create_random_filled_accessor_w(input_shape, allocator)); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 4be2bdf7bb..ad74fa7d36 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -10,11 +10,11 @@ TEST_SUITE(FF_TEST_SUITE) { float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10, 10}, + std::vector{10_n, 10_n}, }; TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index bbeb349ced..238c4ac361 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 7f97563217..b75614588c 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -12,12 +12,13 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{2_n}}; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({50_n}, DataType::FLOAT); GenericTensorAccessorR index_accessor = create_random_filled_accessor_r(output_shape, allocator); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 80a046fe37..8368fe4efd 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -7,8 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { - size_t batch_size = 10; - size_t feature_size = 10; + nonnegative_int batch_size = 10_n; + nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; @@ -29,8 +29,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::LayerNorm::init_kernel(managed_handle.raw_handle(), allocator, elementwise_affine, - batch_size, - feature_size, + batch_size.unwrap_nonnegative(), + feature_size.unwrap_nonnegative(), epsilon); 
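// Annotation (illustrative sketch, not part of the patch): the convention the
// reworked tests follow, assuming the helpers from test_utils.h used
// throughout this diff. Dimensions live as nonnegative_int (the _n literal)
// and are unwrapped exactly once, at raw kernel or cuDNN boundaries that
// still take plain int.
static void nonnegative_int_boundary_example(Allocator &allocator) {
  nonnegative_int batch_size = 10_n;
  nonnegative_int feature_size = 10_n;

  // Shape construction consumes nonnegative_int dimensions directly...
  TensorShape shape = make_tensor_shape_from_legion_dims(
      {batch_size, feature_size}, DataType::FLOAT);
  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);

  // ...while legacy kernel entry points receive the unwrapped value.
  int raw_batch_size = batch_size.unwrap_nonnegative();
  (void)raw_batch_size;
  (void)accessor;
}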
GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 25264b7a58..c1be78bd16 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index eb0702a970..ff74f6fb28 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -6,10 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { - size_t input_w = 10, input_h = 10, input_c = 3, input_n = 1; - size_t output_w = 5, output_h = 5, output_c = 3, output_n = 1; - size_t pad_h = 0, pad_w = 0, kernel_h = 2, kernel_w = 2, stride_h = 2, - stride_w = 2; + nonnegative_int input_w = 10_n; + nonnegative_int input_h = 10_n; + nonnegative_int input_c = 3_n; + nonnegative_int input_n = 1_n; + nonnegative_int output_w = 5_n; + nonnegative_int output_h = 5_n; + nonnegative_int output_c = 3_n; + nonnegative_int output_n = 1_n; + nonnegative_int pad_h = 0_n; + nonnegative_int pad_w = 0_n; + nonnegative_int kernel_h = 2_n; + nonnegative_int kernel_w = 2_n; + nonnegative_int stride_h = 2_n; + nonnegative_int stride_w = 2_n; PoolOp pool_type = PoolOp::MAX; @@ -21,23 +31,23 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); Pool2DPerDeviceState state = - Kernels::Pool2D::init_kernel(managed_handle.raw_handle(), - std::nullopt, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - kernel_h, - kernel_w, - stride_h, - stride_w, - pool_type); + Kernels::Pool2D::init_kernel(/*handle=*/managed_handle.raw_handle(), + /*activation=*/std::nullopt, + /*input_w=*/input_w.unwrap_nonnegative(), + /*input_h=*/input_h.unwrap_nonnegative(), + /*input_c=*/input_c.unwrap_nonnegative(), + /*input_n=*/input_n.unwrap_nonnegative(), + /*output_w=*/output_w.unwrap_nonnegative(), + /*output_h=*/output_h.unwrap_nonnegative(), + /*output_c=*/output_c.unwrap_nonnegative(), + /*output_n=*/output_n.unwrap_nonnegative(), + /*pad_h=*/pad_h.unwrap_nonnegative(), + /*pad_w=*/pad_w.unwrap_nonnegative(), + /*kernel_h=*/kernel_h.unwrap_nonnegative(), + /*kernel_w=*/kernel_w.unwrap_nonnegative(), + /*stride_h=*/stride_h.unwrap_nonnegative(), + /*stride_w=*/stride_w.unwrap_nonnegative(), + /*pool_type=*/pool_type); TensorShape input_shape = make_tensor_shape_from_legion_dims( {input_w, input_h, input_c, input_n}, DataType::FLOAT); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index a33748c0de..5078edee57 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t num_replicas = 5; TensorShape input_shape = make_tensor_shape_from_legion_dims( - {10, 10, 10, 10, 10}, DataType::FLOAT); + {10_n, 10_n, 10_n, 10_n, 10_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -20,7 +20,8 @@ TEST_SUITE(FF_TEST_SUITE) { 
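// Annotation (illustrative sketch, not part of the patch): every test case in
// this series now spells out the handle configuration instead of relying on
// the old default constructor. A helper along the lines below (name and
// placement assumed, not introduced by this patch) would keep the repeated
// literals in one place.
static ManagedPerDeviceFFHandle make_test_handle() {
  return ManagedPerDeviceFFHandle{
      /*workSpaceSize=*/1024 * 1024,
      /*allowTensorOpMathConversion=*/true};
}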
SUBCASE("forward_kernel") { TensorShape output_shape = - make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); + + make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 27223cc7b5..5133c4c89c 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -6,12 +6,12 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Replicate Forward and Backward Kernels") { - std::size_t num_replicas = 10; + nonnegative_int num_replicas = 10_n; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -48,12 +48,12 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { - std::size_t num_replicas = 2; + nonnegative_int num_replicas = 10_n; TensorShape input_shape = - make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5_n, num_replicas}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 5c04012da2..ee7530c017 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index c06919d603..bf23188a8f 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -7,9 +7,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - std::size_t reverse_dim_size = 10; - std::size_t in_blk_size = 10; - std::size_t num_out_blks = 1; + nonnegative_int reverse_dim_size = 10_n; + nonnegative_int in_blk_size = 10_n; + nonnegative_int num_out_blks = 1_n; TensorShape input_shape = make_tensor_shape_from_legion_dims( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); @@ -29,13 +29,14 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor.shape.num_elements()); + Kernels::Reverse::forward_kernel( + managed_stream.raw_stream(), + input_accessor.get_float_ptr(), + output_accessor.get_float_ptr(), + num_out_blks.unwrap_nonnegative(), + 
reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_accessor.shape.num_elements().unwrap_nonnegative()); CHECK(contains_non_zero(output_accessor)); } @@ -50,19 +51,19 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor.shape.num_elements()); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_grad_accessor.shape.num_elements().unwrap_nonnegative()); CHECK(contains_non_zero(input_grad_accessor)); } } TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 4; - std::size_t reverse_dim_size = 3; - std::size_t in_blk_size = 2; + nonnegative_int num_out_blks = 4_n; + nonnegative_int reverse_dim_size = 3_n; + nonnegative_int in_blk_size = 2_n; TensorShape input_shape = make_tensor_shape_from_legion_dims( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); @@ -90,10 +91,10 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), input_accessor_gpu.get_float_ptr(), output_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_gpu.shape.num_elements()); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_accessor_gpu.shape.num_elements().unwrap_nonnegative()); // Run CPU Cast Forward Kernel GenericTensorAccessorR input_accessor_cpu = @@ -118,10 +119,10 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), output_grad_accessor_gpu.get_float_ptr(), input_grad_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_gpu.shape.num_elements()); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_grad_accessor_gpu.shape.num_elements().unwrap_nonnegative()); // Run CPU Cast Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 5519c30b80..d4fb496f7b 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -6,7 +6,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { - int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; + nonnegative_int input_n = 1_n; + nonnegative_int input_c = 1_n; + nonnegative_int input_h = 1_n; + nonnegative_int input_w = 100_n; + nonnegative_int channels = 100_n; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -16,11 +20,16 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; - SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( - managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); + SoftmaxPerDeviceState state = + Kernels::Softmax::init_kernel(managed_handle.raw_handle(), + 0, + input_n.unwrap_nonnegative(), + channels.unwrap_nonnegative(), + input_h.unwrap_nonnegative(), + input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = 
create_random_filled_accessor_w(output_shape, allocator); @@ -47,7 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - output_grad_accessor.shape.num_elements()); + output_grad_accessor.shape.num_elements().unwrap_nonnegative()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index e94d102b71..d98f88a30e 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -8,7 +8,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { - size_t num_outputs = 2; + nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; @@ -21,9 +21,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({50_n}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -41,11 +41,11 @@ TEST_SUITE(FF_TEST_SUITE) { out_blk_sizes, in_blk_size, num_blks, - num_outputs); + num_outputs.unwrap_nonnegative()); } SUBCASE("backward_kernel") { - std::vector output_grad_ptrs(num_outputs); + std::vector output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); @@ -61,7 +61,7 @@ TEST_SUITE(FF_TEST_SUITE) { out_blk_sizes, in_blk_size, num_blks, - num_outputs); + num_outputs.unwrap_nonnegative()); } } } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index f87fb67921..cac43c6ff3 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -5,10 +5,12 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { - std::size_t num_dims = 2; - - std::vector perm = {ff_dim_t{nonnegative_int{0}}, - ff_dim_t{nonnegative_int{1}}}; + TransposeAttrs attrs = TransposeAttrs{ + FFOrdered{ + ff_dim_t{0_n}, + ff_dim_t{1_n}, + }, + }; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -17,11 +19,8 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - TransposePerDeviceState state = - Kernels::Transpose::init_kernel(num_dims, perm); - TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -31,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( - managed_stream.raw_stream(), state, input_accessor, output_accessor); + managed_stream.raw_stream(), attrs, input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } @@ -43,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), - state, + attrs, output_grad_accessor, input_grad_accessor); diff --git 
a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml b/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml index 5f73bbbb8e..db476e771d 100644 --- a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml +++ b/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml @@ -6,7 +6,7 @@ features = [ includes = [ "kernels/attention_kernels.h", - "kernels/batch_norm_kernels.h", + "kernels/batch_norm_per_device_state.dtg.h", "kernels/conv_2d_kernels.h", "kernels/dropout_kernels.h", "kernels/element_binary_kernels.h", @@ -84,7 +84,3 @@ key = "device_specific_softmax_per_device_state" [[values]] type = "::FlexFlow::DeviceSpecific<::FlexFlow::TopKPerDeviceState>" key = "device_specific_topk_per_device_state" - -[[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::TransposePerDeviceState>" -key = "device_specific_transpose_per_device_state" diff --git a/lib/local-execution/include/local-execution/legion_tensor_shape.h b/lib/local-execution/include/local-execution/legion_tensor_shape.h deleted file mode 100644 index 3786383865..0000000000 --- a/lib/local-execution/include/local-execution/legion_tensor_shape.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TENSOR_SHAPE_H -#define _FLEXFLOW_RUNTIME_SRC_TENSOR_SHAPE_H - -#include "kernels/legion_dim.h" -#include "op-attrs/datatype.h" -#include "op-attrs/ff_dim_t.h" -#include "op-attrs/tensor_shape.dtg.h" -#include "utils/stack_vector/stack_vector.h" -#include "utils/visitable.h" -#include - -namespace FlexFlow { - -// TODO FIXME @lockshaw remove inheritance from legion tensor dims -struct LegionTensorShape : public use_visitable_cmp, - public LegionTensorDims { - LegionTensorShape() = delete; - LegionTensorShape(std::vector const &dims, DataType data_type); - LegionTensorShape(TensorShape const &); - - template - LegionTensorShape(stack_vector const &dims, - DataType data_type) - : LegionTensorDims(dims.start(), dims.end()), data_type(data_type) {} - - operator TensorShape() const; - -public: - DataType data_type; -}; - -ff_dim_t to_ff(legion_dim_t, size_t num_dims); -legion_dim_t legion_dim_from_ff_dim(ff_dim_t, size_t num_dims); - -ff_dim_t to_ff(legion_dim_t, TensorShape const &); -legion_dim_t legion_dim_from_ff_dim(ff_dim_t, TensorShape const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_id_t.enum.toml b/lib/local-execution/include/local-execution/task_id_t.enum.toml index 9cbe64c268..b0c82b5d26 100644 --- a/lib/local-execution/include/local-execution/task_id_t.enum.toml +++ b/lib/local-execution/include/local-execution/task_id_t.enum.toml @@ -205,9 +205,6 @@ name = "TOPK_FWD_TASK_ID" [[values]] name = "TOPK_BWD_TASK_ID" -[[values]] -name = "TRANSPOSE_INIT_TASK_ID" - [[values]] name = "TRANSPOSE_FWD_TASK_ID" diff --git a/lib/local-execution/src/legion_tensor_shape.cc b/lib/local-execution/src/legion_tensor_shape.cc deleted file mode 100644 index b227accc2e..0000000000 --- a/lib/local-execution/src/legion_tensor_shape.cc +++ /dev/null @@ -1,15 +0,0 @@ -#include "local-execution/legion_tensor_shape.h" -#include "kernels/legion_dim.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, size_t num_dims) { - return legion_dim_t(num_dims - ff_dim.value.get_value() - 1); -} - -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, TensorShape const &shape) { - return 
legion_dim_from_ff_dim(ff_dim, num_dims(shape)); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index eebef9039d..e652b666a8 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -85,10 +85,10 @@ static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - size_t qProjSize = acc.get_argument(QPROJSIZE); - size_t kProjSize = acc.get_argument(KPROJSIZE); - size_t vProjSize = acc.get_argument(VPROJSIZE); - size_t oProjSize = acc.get_argument(OPROJSIZE); + nonnegative_int qProjSize = acc.get_argument(QPROJSIZE); + nonnegative_int kProjSize = acc.get_argument(KPROJSIZE); + nonnegative_int vProjSize = acc.get_argument(VPROJSIZE); + nonnegative_int oProjSize = acc.get_argument(OPROJSIZE); PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape query_parallel_tensor_shape = @@ -108,29 +108,30 @@ static DeviceSpecificDeviceStates key_parallel_tensor_shape, value_parallel_tensor_shape)); - int kvSeqLength = get_kvSeqLength(parsed); - int qSize = get_qSize(parsed); - int kSize = get_kSize(parsed); - int vSize = get_vSize(parsed); - - int qoSeqLength = get_qoSeqLength(parsed); - int num_samples = get_num_samples(parsed); - int num_heads = attrs.num_heads; - - MHAPerDeviceState per_device_state = init_kernel(handle, - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - attrs.add_bias_kv); + nonnegative_int kvSeqLength = get_kvSeqLength(parsed); + nonnegative_int qSize = get_qSize(parsed); + nonnegative_int kSize = get_kSize(parsed); + nonnegative_int vSize = get_vSize(parsed); + + nonnegative_int qoSeqLength = get_qoSeqLength(parsed); + nonnegative_int num_samples = get_num_samples(parsed); + nonnegative_int num_heads = attrs.num_heads; + + MHAPerDeviceState per_device_state = + init_kernel(handle, + allocator, + num_samples.unwrap_nonnegative(), + num_heads.unwrap_nonnegative(), + qSize.unwrap_nonnegative(), + kSize.unwrap_nonnegative(), + vSize.unwrap_nonnegative(), + qProjSize.unwrap_nonnegative(), + kProjSize.unwrap_nonnegative(), + vProjSize.unwrap_nonnegative(), + oProjSize.unwrap_nonnegative(), + qoSeqLength.unwrap_nonnegative(), + kvSeqLength.unwrap_nonnegative(), + attrs.add_bias_kv); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index 1eae409ae2..ad331156b5 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -18,6 +18,8 @@ #include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -65,24 +67,30 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { FFIterationConfig iter_config = acc.get_argument(ITERATION_CONFIG); - int m = b_input.shape[legion_dim_t(0)]; - assert(m == output.shape[legion_dim_t(0)]); - int n = a_input.shape[legion_dim_t(1)]; - assert(n == output.shape[legion_dim_t(1)]); - int k = a_input.shape[legion_dim_t(0)]; - assert(k == b_input.shape[legion_dim_t(1)]); + nonnegative_int m = b_input.shape.at(legion_dim_t{0_n}); + assert(m == 
output.shape.at(legion_dim_t{0_n}));
+  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
+  assert(n == output.shape.at(legion_dim_t{1_n}));
+  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
+  assert(k == b_input.shape.at(legion_dim_t{1_n}));
   assert(a_input.shape.get_volume() == b_input.shape.get_volume());
   assert(a_input.shape.get_volume() == output.shape.get_volume());
-  int batch = 1;
-  for (int i = 2; i < a_input.shape.get_dim(); i++) {
-    int dim_size = a_input.shape[legion_dim_t(i)];
-    assert(dim_size == b_input.shape[legion_dim_t(i)]);
-    assert(dim_size == output.shape[legion_dim_t(i)]);
+  nonnegative_int batch = 1_n;
+  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
+    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
+    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
+    assert(dim_size == output.shape.at(legion_dim_t{i}));
     batch *= dim_size;
   }

+  auto get_raw_seq_len = [](std::optional<nonnegative_int> seq_len) -> int {
+    return transform(seq_len,
+                     [](nonnegative_int x) { return x.unwrap_nonnegative(); })
+        .value_or(-1);
+  };
+
   return profile(forward_kernel,
                  profiling,
                  "[BatchMatmul] forward_time = {:.2lf}ms\n",
@@ -90,12 +98,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
                  output.get_float_ptr(),
                  a_input.get_float_ptr(),
                  b_input.get_float_ptr(),
-                 m,
-                 n,
-                 k,
-                 batch,
-                 attrs.a_seq_length_dim,
-                 attrs.b_seq_length_dim,
+                 m.unwrap_nonnegative(),
+                 n.unwrap_nonnegative(),
+                 k.unwrap_nonnegative(),
+                 batch.unwrap_nonnegative(),
+                 get_raw_seq_len(attrs.a_seq_length_dim),
+                 get_raw_seq_len(attrs.b_seq_length_dim),
                  iter_config.seq_length);
 }

@@ -120,19 +128,20 @@ static std::optional
   assert(b_input.shape == b_input_grad.shape);

   // check dims
-  int m = b_input.shape[legion_dim_t(0)];
-  assert(m == output.shape[legion_dim_t(0)]);
-  int n = a_input.shape[legion_dim_t(1)];
-  assert(n == output.shape[legion_dim_t(1)]);
-  int k = a_input.shape[legion_dim_t(0)];
-  assert(k == b_input.shape[legion_dim_t(1)]);
+  nonnegative_int m = b_input.shape.at(legion_dim_t{0_n});
+  assert(m == output.shape.at(legion_dim_t{0_n}));
+  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
+  assert(n == output.shape.at(legion_dim_t{1_n}));
+  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
+  assert(k == b_input.shape.at(legion_dim_t{1_n}));
   assert(a_input.shape.get_volume() == b_input.shape.get_volume());
   assert(a_input.shape.get_volume() == output.shape.get_volume());
-  int batch = 1;
-  for (int i = 2; i < a_input.shape.dims.num_dims(); i++) {
-    int dim_size = a_input.shape[legion_dim_t(i)];
-    assert(dim_size == b_input.shape[legion_dim_t(i)]);
-    assert(dim_size == output.shape[legion_dim_t(i)]);
+
+  nonnegative_int batch = 1_n;
+  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
+    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
+    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
+    assert(dim_size == output.shape.at(legion_dim_t{i}));
     batch *= dim_size;
   }
@@ -146,10 +155,10 @@ static std::optional
                  a_input_grad.get_float_ptr(),
                  b_input.get_float_ptr(),
                  b_input_grad.get_float_ptr(),
-                 m,
-                 n,
-                 k,
-                 batch);
+                 m.unwrap_nonnegative(),
+                 n.unwrap_nonnegative(),
+                 k.unwrap_nonnegative(),
+                 batch.unwrap_nonnegative());
 }

 TaskImplFunction get_batch_matmul_fwd_task_impl() {
diff --git a/lib/local-execution/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h
index a7e29b1931..23389d5083 100644
--- a/lib/local-execution/src/ops/batch_matmul.h
+++
b/lib/local-execution/src/ops/batch_matmul.h @@ -4,7 +4,7 @@ #include "local-execution/op_task_invocation.h" #include "local-execution/op_task_signature.h" #include "local-execution/sim_environment.h" -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 3aed3111c7..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -75,21 +75,22 @@ static DeviceSpecificDeviceStates auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - int output_w = output.shape[legion_dim_t(0)]; - int output_h = output.shape[legion_dim_t(1)]; - int output_c = output.shape[legion_dim_t(2)]; - int output_n = output.shape[legion_dim_t(3)]; + nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); + nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); + nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); + nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); float *runningMean; - BatchNormPerDeviceState per_device_state = init_kernel(handle, - allocator, - runningMean, - output_n, - output_c, - output_h, - output_w, - attrs.relu); + BatchNormPerDeviceState per_device_state = + init_kernel(handle, + allocator, + runningMean, + output_n.unwrap_nonnegative(), + output_c.unwrap_nonnegative(), + output_h.unwrap_nonnegative(), + output_w.unwrap_nonnegative(), + attrs.relu); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -140,7 +141,7 @@ static std::optional scale.get_float_ptr(), scale_grad.get_float_ptr(), bias_grad.get_float_ptr(), - output.shape.get_volume()); + output.shape.get_volume().unwrap_nonnegative()); } TaskImplFunction get_batch_norm_init_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index d7c5c22170..c4d9c4b21d 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -62,19 +62,19 @@ static DeviceSpecificDeviceStates auto filter_grad = acc.get_tensor_grad(FILTER); Conv2DPerDeviceState per_device_state = - init_kernel(handle, - attrs.activation, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.padding_h, - attrs.padding_w, - attrs.stride_h, - attrs.stride_w, - input, - output, - filter.get_float_ptr(), - filter_grad.get_float_ptr()); + init_kernel(/*handle=*/handle, + /*activation=*/attrs.activation, + /*kernel_h=*/attrs.kernel_h.unwrap_nonnegative(), + /*kernel_w=*/attrs.kernel_w.unwrap_nonnegative(), + /*groups=*/attrs.groups.unwrap_nonnegative(), + /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), + /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), + /*stride_h=*/attrs.stride_h.unwrap_nonnegative(), + /*stride_w=*/attrs.stride_w.unwrap_nonnegative(), + /*input=*/input, + /*output=*/output, + /*filter_ptr=*/filter.get_float_ptr(), + /*filter_grad_ptr=*/filter_grad.get_float_ptr()); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index a015c64f4d..a43c0f757f 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -15,8 +15,8 @@ #include "gather.h" #include "kernels/gather_kernels.h" -#include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" +#include 
"utils/nonnegative_int/nonnegative_range.h" #include namespace FlexFlow { @@ -72,10 +72,11 @@ static DeviceSpecificDeviceStates assert(input.shape.get_dim() == index.shape.get_dim()); assert(output.shape.get_dim() == index.shape.get_dim()); - for (int i = 0; i < input.shape.get_dim(); i++) { - assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); + for (nonnegative_int i : nonnegative_range(input.shape.get_dim())) { + assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); if (i != legion_dim.value) { - assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); + assert(input.shape.at(legion_dim_t{i}) == + index.shape.at(legion_dim_t{i})); } } diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index e99d27319c..c01475d4a4 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -15,12 +15,12 @@ #include "layer_norm.h" #include "kernels/layer_norm_kernels.h" -#include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/exception.h" #include "utils/hash-utils.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include namespace FlexFlow { @@ -119,27 +119,25 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto handle = acc.get_argument(HANDLE); - // question: how to get batch_size and effective_num_elements - int64_t effective_batch_size, effective_num_elements; - int M = 1; + nonnegative_int M = 1_n; for (int i = 0; i < attrs.axes.size(); i++) { - legion_dim_t legion_dim = legion_dim_from_ff_dim( - attrs.axes[i], get_tensor_shape(input.shape, input.data_type)); + legion_dim_t legion_dim = + legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims()); M *= input.shape.at(legion_dim); } - int num_replicas = 1; - for (int i = 0; i < input.shape.num_dims(); i++) { - num_replicas *= input.shape.at(legion_dim_t(i)); - effective_num_elements = M; - effective_batch_size = input.shape.get_volume() / M; + nonnegative_int num_replicas = 1_n; + for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { + num_replicas *= input.shape.at(legion_dim_t{i}); } + nonnegative_int effective_num_elements = M; + nonnegative_int effective_batch_size = input.shape.get_volume() / M; LayerNormPerDeviceState per_device_state = init_kernel(handle, allocator, attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, + effective_batch_size.unwrap_nonnegative(), + effective_num_elements.unwrap_nonnegative(), attrs.eps); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 1eb0360db4..2de850f209 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -66,21 +66,22 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}); - int batch_size = output.shape.at(ff_dim_t{nonnegative_int{1}}); + nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); + nonnegative_int batch_size = output.shape.at(ff_dim_t{1_n}); float *one_ptr; - LinearPerDeviceState per_device_state = init_kernel(handle, - one_ptr, - attrs.activation, - attrs.regularizer, - attrs.use_bias, - input.data_type, - 
weight.data_type,
-                                                     output.data_type,
-                                                     batch_size,
-                                                     attrs.out_channels);
+  LinearPerDeviceState per_device_state =
+      init_kernel(handle,
+                  one_ptr,
+                  attrs.activation,
+                  attrs.regularizer,
+                  attrs.use_bias,
+                  input.data_type,
+                  weight.data_type,
+                  output.data_type,
+                  batch_size.unwrap_nonnegative(),
+                  attrs.out_channels.unwrap_nonnegative());
   return DeviceSpecificDeviceStates{
       DeviceSpecific::create(per_device_state)};
 }
@@ -96,9 +97,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument(PROFILING);
   auto attrs = acc.get_argument(ATTRS);

-  int in_dim = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int batch_size = output.shape.get_volume() / out_dim;
+  nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n});
+  nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n});
+  nonnegative_int batch_size = output.shape.get_volume() / out_dim;

   float const *bias_ptr = NULL;
   if (attrs.use_bias) {
@@ -113,9 +114,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
                  output.get_float_ptr(),
                  weight.get_float_ptr(),
                  bias_ptr,
-                 in_dim,
-                 out_dim,
-                 batch_size);
+                 in_dim.unwrap_nonnegative(),
+                 out_dim.unwrap_nonnegative(),
+                 batch_size.unwrap_nonnegative());
 }
 ;
@@ -140,9 +141,9 @@ static std::optional
     bias_ptr = bias.get_float_ptr();
   }

-  int in_dim = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int batch_size = output.shape.get_volume() / out_dim;
+  nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n});
+  nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n});
+  nonnegative_int batch_size = output.shape.get_volume() / out_dim;

   return profile(backward_kernel,
                  profiling,
@@ -155,9 +156,9 @@ static std::optional
                  weight.get_float_ptr(),
                  weight_grad.get_float_ptr(),
                  bias_ptr,
-                 in_dim,
-                 out_dim,
-                 batch_size);
+                 in_dim.unwrap_nonnegative(),
+                 out_dim.unwrap_nonnegative(),
+                 batch_size.unwrap_nonnegative());
 }

 TaskImplFunction get_linear_init_task_impl() {
diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc
index a1167a731c..13f6a78381 100644
--- a/lib/local-execution/src/ops/pool_2d.cc
+++ b/lib/local-execution/src/ops/pool_2d.cc
@@ -22,6 +22,22 @@ OpTaskInvocation init(Pool2DAttrs const &attrs) {
   return {task_id_t::POOL2D_INIT_TASK_ID, binding};
 }

+static nonnegative_int calculate_padding(nonnegative_int output_size,
+                                         nonnegative_int stride,
+                                         nonnegative_int kernel_size,
+                                         nonnegative_int input_size) {
+  int o = output_size.unwrap_nonnegative();
+  int s = stride.unwrap_nonnegative();
+  int k = kernel_size.unwrap_nonnegative();
+  int i = input_size.unwrap_nonnegative();
+
+  // Solve o = (i + 2p - k) / s + 1 for the padding p, rounding up:
+  // p = ceil(((o - 1) * s + k - i) / 2) = ((o - 1) * s + k - i + 1) / 2.
+  return nonnegative_int{
+      ((o - 1) * s + k - i + 1) / 2,
+  };
+}
+
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument(ATTRS);
@@ -30,56 +46,33 @@ static DeviceSpecificDeviceStates
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

-  int input_w = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int input_h = input.shape.at(ff_dim_t{nonnegative_int{1}}) + 1;
-  int input_c = input.shape.at(ff_dim_t{nonnegative_int{2}}) + 1;
-  int input_n = input.shape.at(ff_dim_t{nonnegative_int{3}}) + 1;
-  int output_w = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int output_h = output.shape.at(ff_dim_t{nonnegative_int{1}}) + 1;
-  int output_c =
output.shape.at(ff_dim_t{nonnegative_int{2}}) + 1; - int output_n = output.shape.at(ff_dim_t{nonnegative_int{3}}) + 1; - - printf("init pool (input): n(%d) c(%d) h(%d) " - "w(%d)\n", - input_n, - input_c, - input_h, - input_w); - printf("init pool (output): n(%d) c(%d) h(%d) w(%d)\n", - output_n, - output_c, - output_h, - output_w); - - int pad_h = - ((output_h - 1) * attrs.stride_h + attrs.kernel_h - input_h + 1) / 2; - int pad_w = - ((output_w - 1) * attrs.stride_w + attrs.kernel_w - input_w + 1) / 2; - if (pad_h != attrs.padding_h) { - printf("Warning: changing pool_padding_h to satisfy output_h size\n"); - } - - if (pad_w != attrs.padding_w) { - printf("Warning: changing pool_padding_w to satisfy output_w size\n"); - } - - Pool2DPerDeviceState per_device_state = init_kernel(handle, - attrs.activation, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - attrs.kernel_h, - attrs.kernel_w, - attrs.stride_h, - attrs.stride_w, - attrs.pool_type); + nonnegative_int input_w = input.shape.at(ff_dim_t{0_n}); + nonnegative_int input_h = input.shape.at(ff_dim_t{1_n}); + nonnegative_int input_c = input.shape.at(ff_dim_t{2_n}); + nonnegative_int input_n = input.shape.at(ff_dim_t{3_n}); + nonnegative_int output_w = output.shape.at(ff_dim_t{0_n}); + nonnegative_int output_h = output.shape.at(ff_dim_t{1_n}); + nonnegative_int output_c = output.shape.at(ff_dim_t{2_n}); + nonnegative_int output_n = output.shape.at(ff_dim_t{3_n}); + + Pool2DPerDeviceState per_device_state = + init_kernel(handle, + attrs.activation, + input_w.unwrap_nonnegative(), + input_h.unwrap_nonnegative(), + input_c.unwrap_nonnegative(), + input_n.unwrap_nonnegative(), + output_w.unwrap_nonnegative(), + output_h.unwrap_nonnegative(), + output_c.unwrap_nonnegative(), + output_n.unwrap_nonnegative(), + attrs.padding_h.unwrap_nonnegative(), + attrs.padding_w.unwrap_nonnegative(), + attrs.kernel_h.unwrap_nonnegative(), + attrs.kernel_w.unwrap_nonnegative(), + attrs.stride_h.unwrap_nonnegative(), + attrs.stride_w.unwrap_nonnegative(), + attrs.pool_type); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index a043d9f847..3f92d7fd77 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -41,9 +41,14 @@ static DeviceSpecificDeviceStates OperatorType op_type = attrs.op_type; - size_t reduction_size = input.shape.get_volume() / output.shape.get_volume(); + nonnegative_int reduction_size = + input.shape.get_volume() / output.shape.get_volume(); ReducePerDeviceState per_device_state = - init_kernel(handle, op_type, reduction_size, input.shape, output.shape); + init_kernel(handle, + op_type, + reduction_size.unwrap_nonnegative(), + input.shape, + output.shape); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index 1e85d7186e..cab7c3e22d 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -50,14 +50,14 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - size_t num_replicas = attrs.reduction_degree; + nonnegative_int num_replicas = attrs.reduction_degree; return profile(forward_kernel, profiling_settings, "[Reduction] forward_time = {:.2lf}ms\n", 
input, output, - num_replicas); + num_replicas.unwrap_nonnegative()); } static std::optional diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 56bbfdd371..17e0065de5 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -62,14 +62,14 @@ static std::optional auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - auto const &attrs = acc.get_argument(ATTRS); + auto attrs = acc.get_argument(ATTRS); return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", output_grad, input_grad, - attrs.replicate_degree); + attrs.replicate_degree.unwrap_nonnegative()); } TaskImplFunction get_replicate_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 8ac4c045c7..94dfc90f7a 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -17,6 +17,7 @@ #include "kernels/accessor.h" #include "kernels/reverse_kernels.h" #include "op-attrs/get_output_shapes.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -48,16 +49,18 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int output_size = output.shape.get_volume(); + nonnegative_int output_size = output.shape.get_volume(); auto axis = attrs.axis; - coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < output.shape.get_dim(); i++) { + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{nonnegative_int{i}}); + in_blk_size *= output.shape.at(ff_dim_t{i}); } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{nonnegative_int{i}}); + reverse_dim_size = output.shape.at(ff_dim_t{i}); } else { - num_out_blks *= output.shape.at(ff_dim_t{nonnegative_int{i}}); + num_out_blks *= output.shape.at(ff_dim_t{i}); } } @@ -66,10 +69,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { "[reverse] forward_time = {:.2lf}ms\n", input.get_float_ptr(), output.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - output_size); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + output_size.unwrap_nonnegative()); } static std::optional @@ -79,15 +82,18 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input_grad.shape.get_dim() - attrs.axis.value.get_value() - 1; - coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < input_grad.shape.get_dim(); i++) { + int axis = input_grad.shape.num_dims().unwrap_nonnegative() - + attrs.axis.value.unwrap_nonnegative() - 1; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); + in_blk_size *= input_grad.shape.at(ff_dim_t{i}); } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); + reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); } 
else {
-      num_out_blks *= input_grad.shape.at(ff_dim_t{nonnegative_int{i}});
+      num_out_blks *= input_grad.shape.at(ff_dim_t{i});
     }
   }

@@ -96,10 +102,10 @@ static std::optional
                  "[reverse] backward_time = {:.2lf}ms\n",
                  output_grad.get_float_ptr(),
                  input_grad.get_float_ptr(),
-                 num_out_blks,
-                 reverse_dim_size,
-                 in_blk_size,
-                 input_grad.shape.get_volume());
+                 num_out_blks.unwrap_nonnegative(),
+                 reverse_dim_size.unwrap_nonnegative(),
+                 in_blk_size.unwrap_nonnegative(),
+                 input_grad.shape.get_volume().unwrap_nonnegative());
 }

 TaskImplFunction get_reverse_fwd_task_impl() {
diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc
index 71a6ce435e..a1f29e2c98 100644
--- a/lib/local-execution/src/ops/softmax.cc
+++ b/lib/local-execution/src/ops/softmax.cc
@@ -59,18 +59,18 @@ static DeviceSpecificDeviceStates
   auto output = acc.get_tensor(OUTPUT);
   auto const &attrs = acc.get_argument(ATTRS);

-  int output_w = output.shape.at(legion_dim_t(0));
-  int output_h = output.shape.at(legion_dim_t(1));
-  int output_c = output.shape.at(legion_dim_t(2));
-  int output_n = output.shape.at(legion_dim_t(3));
+  nonnegative_int output_w = output.shape.at(legion_dim_t{0_n});
+  nonnegative_int output_h = output.shape.at(legion_dim_t{1_n});
+  nonnegative_int output_c = output.shape.at(legion_dim_t{2_n});
+  nonnegative_int output_n = output.shape.at(legion_dim_t{3_n});

   SoftmaxPerDeviceState per_device_state = init_kernel(handle,
-                                                       attrs.dim.value.get_value(),
-                                                       output_n,
-                                                       output_c,
-                                                       output_h,
-                                                       output_w);
+                                                       attrs.dim.value.unwrap_nonnegative(),
+                                                       output_n.unwrap_nonnegative(),
+                                                       output_c.unwrap_nonnegative(),
+                                                       output_h.unwrap_nonnegative(),
+                                                       output_w.unwrap_nonnegative());

   return DeviceSpecificDeviceStates{
       DeviceSpecific::create(per_device_state)};
@@ -109,7 +109,7 @@ static std::optional
                  "[SoftMax] backward_time = {:.2lf}ms\n",
                  output_grad.get_float_ptr(),
                  input_grad.get_float_ptr(),
-                 output_grad.shape.get_volume());
+                 output_grad.shape.get_volume().unwrap_nonnegative());
 }

 TaskImplFunction get_softmax_init_task_impl() {
diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc
index c289bca205..f119ae235b 100644
--- a/lib/local-execution/src/ops/split.cc
+++ b/lib/local-execution/src/ops/split.cc
@@ -19,6 +19,7 @@
 #include "op-attrs/get_output_shapes.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"
+#include "utils/nonnegative_int/nonnegative_range.h"

 namespace FlexFlow {

@@ -44,19 +45,20 @@ OpTaskInvocation backward(SplitAttrs const &attrs) {
   return {task_id_t::SPLIT_BWD_TASK_ID, binding};
 }

-void calc_block_size(coord_t &num_blocks,
-                     coord_t &block_size,
-                     ArrayShape const &array_shape,
-                     ff_dim_t axis) {
-  num_blocks = 1;
-  block_size = 1;
-  for (int d = 0; d < array_shape.num_elements(); d++) {
-    if (d <= axis.value.get_value()) {
-      block_size *= array_shape.at(legion_dim_t(d));
+// For legion-ordered dims [d0, d1, ..., dn-1] and axis a, block_size is the
+// product d0 * ... * da and num_blocks is the product d(a+1) * ... * d(n-1).
+static std::pair<nonnegative_int, nonnegative_int>
+    calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) {
+  nonnegative_int num_blocks = 1_n;
+  nonnegative_int block_size = 1_n;
+  for (nonnegative_int d : nonnegative_range(array_shape.get_dim())) {
+    if (d <= axis.value) {
+      block_size *= array_shape.at(legion_dim_t{d});
     } else {
-      num_blocks *= array_shape.at(legion_dim_t(d));
+      num_blocks *= array_shape.at(legion_dim_t{d});
     }
   }
+  return {num_blocks, block_size};
 }

 static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
@@ -65,13 +67,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
   auto output = acc.get_tensor(OUTPUT);
   auto attrs = acc.get_argument(ATTRS);

-  coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS];
-  calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis);
+  coord_t out_block_sizes[MAX_NUM_OUTPUTS];
+  auto [num_blocks, in_block_size] = calc_block_size(input.shape, attrs.axis);
   for (int i = 0; i < attrs.splits.size(); i++) {
-    coord_t out_num_blocks;
-    calc_block_size(
-        out_num_blocks, out_block_size[i], output.shape, attrs.axis);
+    auto [_, out_block_size] = calc_block_size(output.shape, attrs.axis);
+    out_block_sizes[i] = out_block_size.unwrap_nonnegative();
   }
   float *output_float_ptr = output.get_float_ptr();
   return profile(forward_kernel,
@@ -79,9 +80,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
                  "Split forward_time = {:.2lf}ms\n",
                  &output_float_ptr,
                  input.get_float_ptr(),
-                 out_block_size,
-                 in_block_size,
-                 num_blocks,
+                 out_block_sizes,
+                 in_block_size.unwrap_nonnegative(),
+                 num_blocks.unwrap_nonnegative(),
                  attrs.splits.size());
 }

@@ -93,12 +94,13 @@ static std::optional
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto attrs = acc.get_argument(ATTRS);

-  coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS];
-  calc_block_size(num_blocks, in_block_size, input_grad.shape, attrs.axis);
+  coord_t out_block_sizes[MAX_NUM_OUTPUTS];
+  auto [num_blocks, in_block_size] =
+      calc_block_size(input_grad.shape, attrs.axis);
+
   for (int i = 0; i < attrs.splits.size(); i++) {
-    coord_t out_num_blocks;
-    calc_block_size(
-        out_num_blocks, out_block_size[i], output_grad.shape, attrs.axis);
+    auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis);
+    out_block_sizes[i] = out_block_size.unwrap_nonnegative();
   }
   float const *output_grad_ptr = output_grad.get_float_ptr();
   return profile(backward_kernel,
@@ -106,9 +108,9 @@ static std::optional
                  "Split backward_time = {:.2lf}ms\n",
                  input_grad.get_float_ptr(),
                  &output_grad_ptr,
-                 out_block_size,
-                 in_block_size,
-                 num_blocks,
+                 out_block_sizes,
+                 in_block_size.unwrap_nonnegative(),
+                 num_blocks.unwrap_nonnegative(),
                  attrs.splits.size());
 }

diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc
index 7f3519529a..e9d202a38f 100644
--- a/lib/local-execution/src/ops/topk.cc
+++ b/lib/local-execution/src/ops/topk.cc
@@ -75,8 +75,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

-  int length = input.shape.at(legion_dim_t(0)) + 1;
-  size_t batch_size = input.shape.get_volume() / length;
+  nonnegative_int length = input.shape.at(legion_dim_t{0_n});
+  nonnegative_int batch_size = input.shape.get_volume() / length;
   auto indices = acc.get_tensor(INDICES);

   return profile(forward_kernel,
@@ -86,9 +86,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
                  input.get_float_ptr(),
                  output.get_float_ptr(),
                  indices.get_int32_ptr(),
-                 batch_size,
-                 length,
-                 attrs.k,
+                 batch_size.unwrap_nonnegative(),
+                 length.unwrap_nonnegative(),
+                 attrs.k.unwrap_nonnegative(),
                  attrs.sorted);
 }

@@ -104,8 +104,8 @@ static std::optional
   auto indices = acc.get_tensor(INDICES);

-  int length = input_grad.shape.at(legion_dim_t(0)) + 1;
-  size_t batch_size = input_grad.shape.get_volume() / length;
+  nonnegative_int length = input_grad.shape.at(legion_dim_t{0_n});
+  nonnegative_int batch_size = input_grad.shape.get_volume() / length;

   return profile(backward_kernel,
                  profiling,
@@ -114,9 +114,9 @@ static std::optional
                  output_grad.get_float_ptr(),
indices.get_int32_ptr(), input_grad.get_float_ptr(), - batch_size, - length, - attrs.k); + batch_size.unwrap_nonnegative(), + length.unwrap_nonnegative(), + attrs.k.unwrap_nonnegative()); } TaskImplFunction get_topk_init_task_impl() { diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 30310d3349..0769cbb76f 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -28,39 +28,11 @@ enum Slots { OUTPUT, // tensor ATTRS, PROFILING, - PER_DEVICE_STATE, }; -OpTaskInvocation init(TransposeAttrs const &attrs) { - OpTaskBinding binding; - binding.bind_arg(ATTRS, attrs); - return {task_id_t::TRANSPOSE_INIT_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - int size = int_from_size_t(attrs.perm.size()); - - std::vector perm = [&] { - std::vector result; - for (int i : range(size)) { - result.push_back(ff_dim_t{nonnegative_int{size - i - 1}}); - } - return result; - }(); - - TransposePerDeviceState per_device_state = init_kernel(size, perm); - - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - OpTaskInvocation forward(TransposeAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_tensor(0)); @@ -71,8 +43,7 @@ OpTaskInvocation forward(TransposeAttrs const &attrs) { static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -80,7 +51,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, "[Transpose] Forward_time = {:.2lf} [ms]", - per_device_state, + attrs, input, output); } @@ -88,8 +59,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); @@ -97,7 +67,7 @@ static std::optional return profile(backward_kernel, profiling, "[Transpose] Backward_time = {:.2lf} [ms]", - per_device_state, + attrs, output_grad, input_grad); } @@ -108,42 +78,31 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) { return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding}; } -TaskImplFunction get_transpose_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; -} TaskImplFunction get_transpose_fwd_task_impl() { return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; } + TaskImplFunction get_transpose_bwd_task_impl() { return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; } -OpTaskSignature get_transpose_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_return_value(); - return init; -} OpTaskSignature get_transpose_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_input_slot(INPUT); 
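+  // With TransposePerDeviceState removed, the forward/backward kernels take
+  // TransposeAttrs directly, so this signature requires no PER_DEVICE_STATE
+  // slot.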
fwd.add_output_slot(OUTPUT); return fwd; } + OpTaskSignature get_transpose_bwd_signature() { OpTaskSignature bwd = infer_bwd_signature(get_transpose_fwd_signature()); return bwd; } std::vector get_task_ids(TransposeAttrs const &) { - return {task_id_t::TRANSPOSE_INIT_TASK_ID, - task_id_t::TRANSPOSE_FWD_TASK_ID, - task_id_t::TRANSPOSE_BWD_TASK_ID}; + return {task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID}; } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h index 0f3a2e80a0..f2ce014aa7 100644 --- a/lib/local-execution/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -9,15 +9,12 @@ namespace FlexFlow { std::vector get_task_ids(TransposeAttrs const &); -TaskImplFunction get_transpose_init_task_impl(); TaskImplFunction get_transpose_fwd_task_impl(); TaskImplFunction get_transpose_bwd_task_impl(); -OpTaskSignature get_transpose_init_signature(); OpTaskSignature get_transpose_fwd_signature(); OpTaskSignature get_transpose_bwd_signature(); -OpTaskInvocation init(TransposeAttrs const &); OpTaskInvocation forward(TransposeAttrs const &); OpTaskInvocation backward(TransposeAttrs const &); diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc index ca428aad25..60928d42d7 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/local-execution/src/task_signature_impl.cc @@ -193,9 +193,6 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::TOPK_BWD_TASK_ID: return TaskSignatureAndImpl{get_topk_bwd_task_impl(), get_topk_bwd_signature()}; - case task_id_t::TRANSPOSE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_transpose_init_task_impl(), - get_transpose_init_signature()}; case task_id_t::TRANSPOSE_FWD_TASK_ID: return TaskSignatureAndImpl{get_transpose_fwd_task_impl(), get_transpose_fwd_signature()}; @@ -296,7 +293,6 @@ OpTaskInvocation init(ComputationGraphOpAttrs const &op) { [](ReshapeAttrs const &attrs) { return init(attrs); }, [](SoftmaxAttrs const &attrs) { return init(attrs); }, [](TopKAttrs const &attrs) { return init(attrs); }, - [](TransposeAttrs const &attrs) { return init(attrs); }, [](auto const &attrs) -> OpTaskInvocation { throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); }, diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 1ec441fbca..46827e3981 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -19,16 +19,17 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalSlotsBacking -- Attention Op") { // allocate input memory Allocator allocator = create_local_cpu_memory_allocator(); - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, + TensorDims{ + FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; TensorShape query_shape = input_tensor_shape; diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index f52fccb1ed..0fab0f6a60 
100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -9,16 +9,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalTaskArgumentAccessor") { Allocator allocator = create_local_cpu_memory_allocator(); - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, + TensorDims{ + FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index e18b7ea2de..58d6d9be6c 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -14,8 +14,8 @@ TEST_SUITE(FF_TEST_SUITE) { TaskRegistry task_registry = empty_task_registry(); layer_guid_t layer_guid = layer_guid_t{Node{0}}; - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; ComputationGraphOpAttrs attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -76,7 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(correct_task_mapping == task_registry.task_mapping); } SUBCASE("different attrs, still same task fn mapping") { - int embed_dim = 100; + nonnegative_int embed_dim = 100_n; layer_guid_t layer_3 = layer_guid_t{Node{3}}; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ @@ -98,7 +98,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("equality") { TaskRegistry other_task_registry = empty_task_registry(); SUBCASE("different attrs is still equal") { - int embed_dim = 100; + nonnegative_int embed_dim = 100_n; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, diff --git a/lib/models/include/models/bert/bert_config.struct.toml b/lib/models/include/models/bert/bert_config.struct.toml index 398210cf48..cc2a8eb0a7 100644 --- a/lib/models/include/models/bert/bert_config.struct.toml +++ b/lib/models/include/models/bert/bert_config.struct.toml @@ -12,27 +12,28 @@ features = [ includes = [ "op-attrs/activation.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "vocab_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "hidden_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_encoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dim_feedforward" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "hidden_act" @@ -64,8 +65,8 @@ type = "float" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml index 667a6531c3..e7d83efd07 100644 --- a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml +++ 
b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml @@ -14,6 +14,7 @@ includes = [ "", "", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -25,19 +26,19 @@ src_includes = [ [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dense_layers" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "dense_feature_layers" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "feature_shapes" -type = "std::map" +type = "std::map" [[fields]] name = "input_features" diff --git a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml index a2a75c83bb..1290420e16 100644 --- a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml +++ b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml @@ -10,13 +10,17 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_classes" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "aux_logits" diff --git a/lib/models/include/models/split_test/split_test.h b/lib/models/include/models/split_test/split_test.h index b03e45b2d2..dd7089c4f6 100644 --- a/lib/models/include/models/split_test/split_test.h +++ b/lib/models/include/models/split_test/split_test.h @@ -12,7 +12,7 @@ namespace FlexFlow { * @note This is a tiny model developed for testing the original Unity * implementation. It is not a "real" model and has never been trained. */ -ComputationGraph get_split_test_computation_graph(int batch_size); +ComputationGraph get_split_test_computation_graph(nonnegative_int batch_size); } // namespace FlexFlow diff --git a/lib/models/include/models/transformer/transformer_config.struct.toml b/lib/models/include/models/transformer/transformer_config.struct.toml index 23b0478dde..2a0b39feb9 100644 --- a/lib/models/include/models/transformer/transformer_config.struct.toml +++ b/lib/models/include/models/transformer/transformer_config.struct.toml @@ -1,6 +1,5 @@ namespace = "FlexFlow" name = "TransformerConfig" - features = [ "eq", "ord", @@ -10,33 +9,37 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_features" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dim_feedforward" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_encoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_decoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dropout" @@ -48,4 +51,4 @@ type = "float" [[fields]] name = "vocab_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/models/src/models/bert/bert.cc b/lib/models/src/models/bert/bert.cc index cf48f2399b..a5d63e8fdc 100644 --- a/lib/models/src/models/bert/bert.cc +++ b/lib/models/src/models/bert/bert.cc @@ -6,20 +6,22 @@ namespace FlexFlow { BertConfig get_default_bert_config() { - return BertConfig{/*vocab_size=*/30522, - 
/*hidden_size=*/768, - /*num_encoder_layers=*/12, - /*num_heads=*/12, - /*dim_feedforward=*/3072, - /*hidden_act=*/Activation::GELU, - /*hidden_dropout_prob=*/0.1, - /*attention_probs_dropout_prob=*/0.1, - /*initializer_range=*/0.02, - /*layer_norm_eps=*/1e-12, - /*position_embedding_type=*/"absolute", - /*classifier_dropout=*/0.1, - /*sequence_length=*/512, - /*batch_size=*/64}; + return BertConfig{ + /*vocab_size=*/30522_n, + /*hidden_size=*/768_n, + /*num_encoder_layers=*/12_n, + /*num_heads=*/12_n, + /*dim_feedforward=*/3072_n, + /*hidden_act=*/Activation::GELU, + /*hidden_dropout_prob=*/0.1, + /*attention_probs_dropout_prob=*/0.1, + /*initializer_range=*/0.02, + /*layer_norm_eps=*/1e-12, + /*position_embedding_type=*/"absolute", + /*classifier_dropout=*/0.1, + /*sequence_length=*/512_n, + /*batch_size=*/64_n, + }; } tensor_guid_t @@ -56,9 +58,10 @@ tensor_guid_t InitializerAttrs const &bias_initializer, InitializerAttrs const &projection_initializer) { assert(num_dims(cgb.get_shape(input)) == 3); - std::vector layer_norm_axis = {2}; // Apply layernorm across the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; + std::vector layer_norm_axis = { + relative_ff_dim_t{-1}}; // Apply layernorm across the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; tensor_guid_t self_attention = cgb.multihead_attention(input, input, @@ -127,7 +130,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { InitializerAttrs bias_initializer = InitializerAttrs{ZeroInitializerAttrs{}}; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.hidden_size}}, DataType::FLOAT, }; @@ -149,7 +152,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { assert( (cgb.get_shape(out_prob) == TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.vocab_size}}, DataType::FLOAT, })); diff --git a/lib/models/src/models/candle_uno/candle_uno.cc b/lib/models/src/models/candle_uno/candle_uno.cc index 4d52d515fb..60422359a5 100644 --- a/lib/models/src/models/candle_uno/candle_uno.cc +++ b/lib/models/src/models/candle_uno/candle_uno.cc @@ -1,32 +1,34 @@ #include "models/candle_uno/candle_uno.h" #include "pcg/initializers/glorot_normal_attrs.dtg.h" +#include "utils/containers/repeat_element.h" namespace FlexFlow { CandleUnoConfig get_default_candle_uno_config() { - CandleUnoConfig config{ - /*batch_size=*/64, - /*dense_layers=*/std::vector(4, 4192), - /*dense_feature_layers=*/std::vector(8, 4192), - /*feature_shapes=*/std::map{}, - /*input_features=*/std::map{}, + return CandleUnoConfig{ + /*batch_size=*/64_n, + /*dense_layers=*/repeat_element(/*num_times=*/4_n, /*element=*/4192_n), + /*dense_feature_layers=*/ + repeat_element(/*num_times=*/8_n, /*element=*/4192_n), + /*feature_shapes=*/ + { + {"dose", 1_n}, + {"cell.rnaseq", 942_n}, + {"drug.descriptors", 5270_n}, + {"drug.fingerprints", 2048_n}, + }, + /*input_features=*/ + { + {"dose1", "dose"}, + {"dose2", "dose"}, + {"cell.rnaseq", "cell.rnaseq"}, + {"drug1.descriptors", "drug.descriptors"}, + {"drug1.fingerprints", "drug.fingerprints"}, + {"drug2.descriptors", "drug.descriptors"}, + {"drug2.fingerprints", "drug.fingerprints"}, + }, /*dropout=*/0.1, /*residual=*/false}; - - config.feature_shapes["dose"] = 1; - 
config.feature_shapes["cell.rnaseq"] = 942; - config.feature_shapes["drug.descriptors"] = 5270; - config.feature_shapes["drug.fingerprints"] = 2048; - - config.input_features["dose1"] = "dose"; - config.input_features["dose2"] = "dose"; - config.input_features["cell.rnaseq"] = "cell.rnaseq"; - config.input_features["drug1.descriptors"] = "drug.descriptors"; - config.input_features["drug1.fingerprints"] = "drug.fingerprints"; - config.input_features["drug2.descriptors"] = "drug.descriptors"; - config.input_features["drug2.fingerprints"] = "drug.fingerprints"; - - return config; } tensor_guid_t create_candle_uno_feature_model( @@ -35,7 +37,7 @@ tensor_guid_t create_candle_uno_feature_model( tensor_guid_t const &input, InitializerAttrs const &kernel_initializer) { tensor_guid_t t = input; - for (int const dense_dim : config.dense_feature_layers) { + for (nonnegative_int dense_dim : config.dense_feature_layers) { t = cgb.dense(t, dense_dim, Activation::RELU, @@ -56,7 +58,7 @@ ComputationGraph InitializerAttrs{GlorotNormalAttrs{/*seed=*/0}}; auto create_input_tensor = - [&](FFOrdered const &dims) -> tensor_guid_t { + [&](FFOrdered const &dims) -> tensor_guid_t { TensorShape input_shape = TensorShape{ TensorDims{dims}, DataType::FLOAT, @@ -82,7 +84,7 @@ ComputationGraph for (auto const &input_feature : config.input_features) { std::string const &feature_name = input_feature.second; - size_t shape = config.feature_shapes.at(feature_name); + nonnegative_int shape = config.feature_shapes.at(feature_name); tensor_guid_t input = create_input_tensor({config.batch_size, shape}); all_inputs.push_back(input); @@ -94,8 +96,9 @@ ComputationGraph } } - tensor_guid_t output = cgb.concat(encoded_inputs, /*axis=*/1); - for (int const &dense_layer_dim : config.dense_layers) { + tensor_guid_t output = + cgb.concat(encoded_inputs, /*axis=*/relative_ff_dim_t{1}); + for (nonnegative_int dense_layer_dim : config.dense_layers) { tensor_guid_t residual_input = output; output = cgb.dense(output, dense_layer_dim, @@ -111,7 +114,7 @@ ComputationGraph } } output = cgb.dense(output, - /*outDim=*/1, + /*outDim=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, diff --git a/lib/models/src/models/inception_v3/inception_v3.cc b/lib/models/src/models/inception_v3/inception_v3.cc index f540eae629..3a829f3754 100644 --- a/lib/models/src/models/inception_v3/inception_v3.cc +++ b/lib/models/src/models/inception_v3/inception_v3.cc @@ -15,14 +15,17 @@ struct CheckShape { ComputationGraphBuilder const &cgb; InceptionV3Config const &config; - void operator()(tensor_guid_t t, int c, int h, int w) const { + void operator()(tensor_guid_t t, + nonnegative_int c, + nonnegative_int h, + nonnegative_int w) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - size_t_from_int(c), - size_t_from_int(h), - size_t_from_int(w), + TensorDims{FFOrdered{ + config.batch_size, + c, + h, + w, }}, DataType::FLOAT, }; @@ -35,12 +38,12 @@ struct CheckShape { } } - void operator()(tensor_guid_t t, int c) const { + void operator()(tensor_guid_t t, nonnegative_int c) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - size_t_from_int(c), + TensorDims{FFOrdered{ + config.batch_size, + c, }}, DataType::FLOAT, }; @@ -56,11 +59,11 @@ struct CheckShape { InceptionV3Config get_default_inception_v3_training_config() { 
return InceptionV3Config{ - /*num_classes=*/1000, + /*num_classes=*/1000_n, // see section 8 of https://arxiv.org/abs/1512.00567 for the source of the // batch size - /*batch_size=*/32, + /*batch_size=*/32_n, // see section 4 of https://arxiv.org/abs/1512.00567 for a discussion of // auxiliary logits. they are used by default in training @@ -70,13 +73,13 @@ InceptionV3Config get_default_inception_v3_training_config() { static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - int filters, - int kernel_size_h, - int kernel_size_w, - int stride_h = 1, - int stride_w = 1, - int padding_h = 0, - int padding_w = 0, + nonnegative_int filters, + nonnegative_int kernel_size_h, + nonnegative_int kernel_size_w, + nonnegative_int stride_h = 1_n, + nonnegative_int stride_w = 1_n, + nonnegative_int padding_h = 0_n, + nonnegative_int padding_w = 0_n, bool use_bias = false) { tensor_guid_t conv = cgb.conv2d(input, /*outChannels=*/filters, @@ -87,7 +90,7 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, /*paddingH=*/padding_h, /*paddingW=*/padding_w, /*activation=*/std::nullopt, - /*groups=*/1, + /*groups=*/1_n, /*use_bias=*/use_bias); return cgb.batch_norm(conv, /*affine=*/true, @@ -98,29 +101,29 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - int pool_features) { + nonnegative_int pool_features) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t branch5x5 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/48, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/48_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/5, - /*kernel_size_w=*/5, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/2, - /*padding_w=*/2); + /*filters=*/64_n, + /*kernel_size_h=*/5_n, + /*kernel_size_w=*/5_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/2_n, + /*padding_w=*/2_n); return t; }(); @@ -128,208 +131,209 @@ static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); return t; }(); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, 
/*filters=*/pool_features, - /*kernel_stride_h=*/1, - /*kernel_stride_w=*/1); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); return t; }(); return cgb.concat({branch1x1, branch5x5, branch3x3dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_b(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch3x3 = create_conv_block(cgb, input, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); tensor_guid_t branch3x3dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_stride_h=*/3, - /*kernel_stride_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - return cgb.concat({branch3x3, branch3x3dbl, branch_pool}, /*axis=*/1); + return cgb.concat({branch3x3, branch3x3dbl, branch_pool}, + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_c(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - int channels_7x7) { + nonnegative_int channels_7x7) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(branch1x1, 192, 17, 17); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(branch1x1, 192_n, 17_n, 17_n); tensor_guid_t branch7x7 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*filters=*/192_n, + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); return t; }(); - check_shape(branch7x7, 192, 17, 17); + check_shape(branch7x7, 192_n, 17_n, 17_n); tensor_guid_t branch7x7dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t =
create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); return t; }(); - check_shape(branch7x7dbl, 192, 17, 17); + check_shape(branch7x7dbl, 192_n, 17_n, 17_n); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); return t; }(); - check_shape(branch_pool, 192, 17, 17); + check_shape(branch_pool, 192_n, 17_n, 17_n); return cgb.concat({branch1x1, branch7x7, branch7x7dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, @@ -338,10 +342,10 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - t = create_conv_block(cgb, t, 320, 3, 3, 2, 2); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + t = create_conv_block(cgb, t, 320_n, 3_n, 3_n, 2_n, 2_n); return t; }(); @@ -349,83 +353,84 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*filters=*/192_n, + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192, - 
/*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/192_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - return cgb.concat({branch3x3, branch7x7x3, branch_pool}, /*axis=*/1); + return cgb.concat({branch3x3, branch7x7x3, branch_pool}, + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/320, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/320_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t branch3x3 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/0); - t = cgb.concat({t_1, t_2}, /*axis=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/0_n); + t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); return t; }(); @@ -433,60 +438,60 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/448, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/448_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/0); - t = cgb.concat({t_1, t_2}, /*axis=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/0_n); + t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); return t; }(); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - 
/*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); return t; }(); return cgb.concat({branch1x1, branch3x3, branch3x3dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, @@ -494,75 +499,75 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t t = input; - check_shape(t, 3, 299, 299); + check_shape(t, 3_n, 299_n, 299_n); // Conv2d_1a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); - check_shape(t, 32, 149, 149); + /*filters=*/32_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); + check_shape(t, 32_n, 149_n, 149_n); // Conv2d_2a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3); - check_shape(t, 32, 147, 147); + /*filters=*/32_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n); + check_shape(t, 32_n, 147_n, 147_n); // Conv2d_2b_3x3 t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); - check_shape(t, 64, 147, 147); + /*filters=*/64_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); + check_shape(t, 64_n, 147_n, 147_n); // maxpool1 t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 64, 73, 73); + check_shape(t, 64_n, 73_n, 73_n); // Conv2d_3b_1x1 t = create_conv_block(cgb, t, - /*filters=*/80, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(t, 80, 73, 73); + /*filters=*/80_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(t, 80_n, 73_n, 73_n); // Conv2d_4a_3x3 t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3); - check_shape(t, 192, 71, 71); + /*filters=*/192_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n); + check_shape(t, 192_n, 71_n, 71_n); // maxpool2 t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 192, 35, 35); + check_shape(t, 192_n, 35_n, 35_n); return t; } @@ -570,26 +575,26 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - size_t num_classes) { + nonnegative_int num_classes) { // avgpool tensor_guid_t x = cgb.pool2d(input, - /*kernelH=*/8, - /*kernelW=*/8, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/8_n, + /*kernelW=*/8_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 2048, 1, 1); 
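The check_shape calls pin down the expected (channels, height, width) after every stage, and the spatial sizes follow the standard convolution/pooling arithmetic, out = (in + 2*padding - kernel) / stride + 1. A self-contained check (hypothetical helper, not part of the patch) reproducing the sizes asserted in create_initial_layers above:

    #include <cassert>

    // out = floor((in + 2*padding - kernel) / stride) + 1
    constexpr int conv2d_out_size(int in, int kernel, int stride, int padding) {
      return (in + 2 * padding - kernel) / stride + 1;
    }

    int main() {
      assert(conv2d_out_size(299, 3, 2, 0) == 149); // Conv2d_1a_3x3
      assert(conv2d_out_size(149, 3, 1, 0) == 147); // Conv2d_2a_3x3
      assert(conv2d_out_size(147, 3, 1, 1) == 147); // Conv2d_2b_3x3 (padded)
      assert(conv2d_out_size(147, 3, 2, 0) == 73);  // maxpool1
      assert(conv2d_out_size(73, 3, 1, 0) == 71);   // Conv2d_4a_3x3
      assert(conv2d_out_size(71, 3, 2, 0) == 35);   // maxpool2
    }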
+ check_shape(x, 2048_n, 1_n, 1_n); // dropout x = cgb.dropout(x, /*rate=*/0.5); - check_shape(x, 2048, 1, 1); + check_shape(x, 2048_n, 1_n, 1_n); x = cgb.flat(x, - /*start_dim=*/1); - check_shape(x, 2048); + /*start_dim=*/relative_ff_dim_t{1}); + check_shape(x, 2048_n); // fc x = cgb.dense(x, @@ -597,7 +602,7 @@ static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, check_shape(x, num_classes); // softmax (not in pytorch model, but shown in Table 1 on p6 of // https://arxiv.org/abs/1512.00567) x = cgb.softmax(x); check_shape(x, num_classes); @@ -607,44 +612,44 @@ static tensor_guid_t create_inception_aux(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - size_t num_classes) { + nonnegative_int num_classes) { tensor_guid_t x = input; - check_shape(x, 768, 17, 17); + check_shape(x, 768_n, 17_n, 17_n); x = cgb.pool2d(x, - /*kernelH=*/5, - /*kernelW=*/5, - /*strideH=*/3, - /*strideW=*/3, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/5_n, + /*kernelW=*/5_n, + /*strideH=*/3_n, + /*strideW=*/3_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 768, 5, 5); + check_shape(x, 768_n, 5_n, 5_n); // conv0 x = create_conv_block(cgb, x, - /*filters=*/128, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(x, 128, 5, 5); + /*filters=*/128_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(x, 128_n, 5_n, 5_n); // conv1 x = create_conv_block(cgb, x, - /*filters=*/768, - /*kernel_size_h=*/5, - /*kernel_size_w=*/5); - check_shape(x, 768, 1, 1); + /*filters=*/768_n, + /*kernel_size_h=*/5_n, + /*kernel_size_w=*/5_n); + check_shape(x, 768_n, 1_n, 1_n); x = cgb.adaptive_pool2d(x, - /*output_h=*/1, - /*output_w=*/1); - check_shape(x, 768, 1, 1); + /*output_h=*/1_n, + /*output_w=*/1_n); + check_shape(x, 768_n, 1_n, 1_n); x = cgb.flat(x, - /*start_dim=*/1); - check_shape(x, 768); + /*start_dim=*/relative_ff_dim_t{1}); + check_shape(x, 768_n); // fc x = cgb.dense(x, @@ -666,39 +671,39 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, }; tensor_guid_t x = create_initial_layers(cgb, check_shape, input); - check_shape(x, 192, 35, 35); + check_shape(x, 192_n, 35_n, 35_n); // Mixed_5b - x = create_inception_module_a(cgb, x, 32); - check_shape(x, 256, 35, 35); + x = create_inception_module_a(cgb, x, 32_n); + check_shape(x, 256_n, 35_n, 35_n); // Mixed_5c - x = create_inception_module_a(cgb, x, 64); - check_shape(x, 288, 35, 35); + x = create_inception_module_a(cgb, x, 64_n); + check_shape(x, 288_n, 35_n, 35_n); // Mixed_5d - x = create_inception_module_a(cgb, x, 64); - check_shape(x, 288, 35, 35); + x = create_inception_module_a(cgb, x, 64_n); + check_shape(x, 288_n, 35_n, 35_n); // Mixed_6a x = create_inception_module_b(cgb, x); - check_shape(x, 768, 17, 17); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6b - x = create_inception_module_c(cgb, check_shape, x, 128); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 128_n); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6c - x = create_inception_module_c(cgb, check_shape, x, 160); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 160_n); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6d - x = create_inception_module_c(cgb, check_shape, x, 160); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 160_n); + check_shape(x,
768_n, 17_n, 17_n); // Mixed_6e - x = create_inception_module_c(cgb, check_shape, x, 192); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 192_n); + check_shape(x, 768_n, 17_n, 17_n); std::optional aux; if (config.aux_logits) { @@ -708,15 +713,15 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, // Mixed_7a x = create_inception_module_d(cgb, x); - check_shape(x, 1280, 8, 8); + check_shape(x, 1280_n, 8_n, 8_n); // Mixed_7b x = create_inception_module_e(cgb, x); - check_shape(x, 2048, 8, 8); + check_shape(x, 2048_n, 8_n, 8_n); // Mixed_7c x = create_inception_module_e(cgb, x); - check_shape(x, 2048, 8, 8); + check_shape(x, 2048_n, 8_n, 8_n); x = create_final_layers(cgb, check_shape, x, config.num_classes); check_shape(x, config.num_classes); @@ -732,11 +737,11 @@ ComputationGraph ComputationGraphBuilder cgb; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - 3, - 299, - 299, + TensorDims{FFOrdered{ + config.batch_size, + 3_n, + 299_n, + 299_n, }}, DataType::FLOAT, }; diff --git a/lib/models/src/models/split_test/split_test.cc b/lib/models/src/models/split_test/split_test.cc index 118f94ec06..d3876d8bfc 100644 --- a/lib/models/src/models/split_test/split_test.cc +++ b/lib/models/src/models/split_test/split_test.cc @@ -4,18 +4,18 @@ namespace FlexFlow { -ComputationGraph get_split_test_computation_graph(int batch_size) { +ComputationGraph get_split_test_computation_graph(nonnegative_int batch_size) { ComputationGraphBuilder cgb; - int layer_dim1 = 256; - int layer_dim2 = 128; - int layer_dim3 = 64; - int layer_dim4 = 32; + nonnegative_int layer_dim1 = 256_n; + nonnegative_int layer_dim2 = 128_n; + nonnegative_int layer_dim3 = 64_n; + nonnegative_int layer_dim4 = 32_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(batch_size), - size_t_from_int(layer_dim1), + TensorDims{FFOrdered{ + batch_size, + layer_dim1, }}, DataType::FLOAT, }; diff --git a/lib/models/src/models/transformer/transformer.cc b/lib/models/src/models/transformer/transformer.cc index 173a1b291c..f71763313a 100644 --- a/lib/models/src/models/transformer/transformer.cc +++ b/lib/models/src/models/transformer/transformer.cc @@ -4,16 +4,16 @@ namespace FlexFlow { TransformerConfig get_default_transformer_config() { - return TransformerConfig{/*num_features=*/512, - /*sequence_length=*/512, - /*batch_size=*/64, - /*dim_feedforward=*/2048, - /*num_heads=*/8, - /*num_encoder_layers=*/6, - /*num_decoder_layers=*/6, + return TransformerConfig{/*num_features=*/512_n, + /*sequence_length=*/512_n, + /*batch_size=*/64_n, + /*dim_feedforward=*/2048_n, + /*num_heads=*/8_n, + /*num_encoder_layers=*/6_n, + /*num_decoder_layers=*/6_n, /*dropout=*/0.1, /*layer_norm_eps=*/1e-05, - /*vocab_size=*/64}; + /*vocab_size=*/64_n}; } tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb, @@ -32,18 +32,20 @@ tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb, tensor_guid_t create_transformer_encoder_layer(ComputationGraphBuilder &cgb, TransformerConfig const &config, tensor_guid_t const &input) { - std::vector layer_norm_axis{2}; // Normalize the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; - tensor_guid_t self_attention = cgb.multihead_attention(input, - input, - input, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + std::vector layer_norm_axis = { + 
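Note the axis change at this point: the hard-coded layer-norm axis 2 becomes relative_ff_dim_t{-1}, meaning "last dimension" regardless of rank. Negative relative dimensions are resolved against the tensor's rank, as in ff_dim_t_from_relative_ff_dim_t (whose signature, further down in this patch, now takes a nonnegative_int input_dim). A sketch of the resolution rule, with the arithmetic assumed rather than quoted from the implementation:

    // Hypothetical mirror of ff_dim_t_from_relative_ff_dim_t's behavior.
    int resolve_relative_dim(int relative_dim, int num_dims) {
      return relative_dim < 0 ? relative_dim + num_dims : relative_dim;
    }
    // For the rank-3 (batch, sequence, feature) activations used here,
    // resolve_relative_dim(-1, 3) == 2, so the old axis 2 and the new
    // relative_ff_dim_t{-1} pick out the same dimension.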
relative_ff_dim_t{-1}}; // Normalize the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; + tensor_guid_t self_attention = + cgb.multihead_attention(/*query=*/input, + /*key=*/input, + /*value=*/input, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention)); @@ -79,18 +81,20 @@ tensor_guid_t TransformerConfig const &config, tensor_guid_t const &input, tensor_guid_t const &encoder_output) { - std::vector layer_norm_axis{2}; // Normalize the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; - tensor_guid_t self_attention = cgb.multihead_attention(input, - input, - input, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + std::vector layer_norm_axis = { + relative_ff_dim_t{-1}}; // Normalize the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; + tensor_guid_t self_attention = + cgb.multihead_attention(/*query=*/input, + /*key=*/input, + /*value=*/input, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention)); @@ -102,15 +106,16 @@ tensor_guid_t assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention_normalized)); - tensor_guid_t mha = cgb.multihead_attention(self_attention_normalized, - encoder_output, - encoder_output, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + tensor_guid_t mha = + cgb.multihead_attention(/*query=*/self_attention_normalized, + /*key=*/encoder_output, + /*value=*/encoder_output, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent(cgb.computation_graph, input, mha)); tensor_guid_t mha_normalized = @@ -148,7 +153,7 @@ ComputationGraph ComputationGraphBuilder cgb; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.num_features}}, DataType::FLOAT, }; diff --git a/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml b/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml index 014526a601..f1c5fe6b23 100644 --- a/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml +++ b/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml @@ -11,7 +11,7 @@ features = [ includes = [ "op-attrs/ops/attention_attrs.dtg.h", - "op-attrs/ops/batch_matmul.dtg.h", + "op-attrs/ops/batch_matmul_attrs.dtg.h", "op-attrs/ops/batch_norm_attrs.dtg.h", "op-attrs/ops/broadcast_attrs.dtg.h", "op-attrs/ops/cast_attrs.dtg.h", diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h index 5af00fb510..3a817af38c 100644 --- a/lib/op-attrs/include/op-attrs/datatype.h +++ b/lib/op-attrs/include/op-attrs/datatype.h @@ -4,6 +4,7 @@ #include "op-attrs/datatype.dtg.h" #include "utils/fmt.h" #include "utils/fp16.h" +#include 
"utils/nonnegative_int/nonnegative_int.h" #include namespace FlexFlow { @@ -49,7 +50,7 @@ typename data_type_enum_to_class
::type cast_to(T t) { template using real_type_t = typename data_type_enum_to_class
::type; -size_t size_of_datatype(DataType); +nonnegative_int size_of_datatype(DataType); bool can_strictly_promote_datatype_from_to(DataType, DataType); diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index 3977f4e0fd..f2355289dc 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -32,19 +32,13 @@ struct DimOrdered { : contents(contents.begin(), contents.end()) {} T const &at(Idx idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); + nonnegative_int raw = idx.value; + return this->contents.at(raw.unwrap_nonnegative()); } T &at(Idx idx) { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); + nonnegative_int raw = idx.value; + return this->contents.at(raw.unwrap_nonnegative()); } T const &operator[](Idx idx) const { @@ -56,11 +50,8 @@ struct DimOrdered { } bool idx_is_valid(Idx const &idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return (raw >= 0 && raw < this->contents.size()); + nonnegative_int raw = idx.value; + return (raw < this->contents.size()); } bool operator==(DimOrdered const &other) const { @@ -172,7 +163,7 @@ struct DimOrdered { : contents(contents.begin(), contents.end()) {} T const &at(ff_dim_t idx) const { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return this->contents.at(raw); } @@ -185,7 +176,7 @@ struct DimOrdered { } T &at(ff_dim_t idx) { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return this->contents.at(raw); } @@ -214,7 +205,7 @@ struct DimOrdered { } bool idx_is_valid(ff_dim_t const &idx) const { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return raw < this->contents.size(); } diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h index c9e6db4d17..166916dd44 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h @@ -27,8 +27,8 @@ FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, std::optional const &end) { auto to_raw_idx = [](std::optional const &idx) -> std::optional { - return transform(idx, - [](ff_dim_t const &i) { return i.value.get_value(); }); + return transform( + idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); }; return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; diff --git a/lib/op-attrs/include/op-attrs/get_op_type.h b/lib/op-attrs/include/op-attrs/get_op_type.h index b60880a98b..7799900709 100644 --- a/lib/op-attrs/include/op-attrs/get_op_type.h +++ b/lib/op-attrs/include/op-attrs/get_op_type.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_OP_ATTRS_GET_OP_TYPE_H #include "op-attrs/ops/attention_attrs.dtg.h" -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" #include "op-attrs/ops/broadcast_attrs.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h index e06d795c04..5f1b11c1bb 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention.h +++ b/lib/op-attrs/include/op-attrs/ops/attention.h @@ -12,31 +12,31 @@ namespace FlexFlow { -int 
get_qProjSize(MultiHeadAttentionAttrs const &); -int get_vProjSize(MultiHeadAttentionAttrs const &); -int get_kProjSize(MultiHeadAttentionAttrs const &); -int get_oProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_vProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &); -int get_qSize(MultiHeadAttentionParallelInputs const &); -int get_qSize(MultiHeadAttentionInputs const &); +nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_qSize(MultiHeadAttentionInputs const &); -int get_kSize(MultiHeadAttentionParallelInputs const &); -int get_kSize(MultiHeadAttentionInputs const &); +nonnegative_int get_kSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_kSize(MultiHeadAttentionInputs const &); -int get_vSize(MultiHeadAttentionParallelInputs const &); -int get_vSize(MultiHeadAttentionInputs const &); +nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_vSize(MultiHeadAttentionInputs const &); -int get_oSize(ParallelTensorShape const &); -int get_oSize(TensorShape const &); +nonnegative_int get_oSize(ParallelTensorShape const &); +nonnegative_int get_oSize(TensorShape const &); -int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); -int get_qoSeqLength(MultiHeadAttentionInputs const &); +nonnegative_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &); -int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); -int get_kvSeqLength(MultiHeadAttentionInputs const &); +nonnegative_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &); -int get_num_samples(MultiHeadAttentionParallelInputs const &); -int get_num_samples(MultiHeadAttentionInputs const &); +nonnegative_int get_num_samples(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_num_samples(MultiHeadAttentionInputs const &); std::vector get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs const &); diff --git a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml index b82b285451..f85b7268af 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml @@ -10,29 +10,29 @@ features = [ ] includes = [ - "", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "query_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "key_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "value_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml index d96d8af69c..019131b07c 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml @@ -10,21 
+10,25 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "embed_dim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "kdim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "vdim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dropout" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h index 574b4ef579..333da4fa29 100644 --- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h +++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_BATCH_MATMUL_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_BATCH_MATMUL_H -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "op-attrs/ops/core.h" #include "op-attrs/parallel_tensor_shape.dtg.h" #include "op-attrs/tensor_shape.dtg.h" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml b/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml deleted file mode 100644 index 3b1dd3f687..0000000000 --- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml +++ /dev/null @@ -1,19 +0,0 @@ -namespace = "FlexFlow" -name = "BatchMatmulAttrs" - -features = [ - "eq", - "ord", - "hash", - "json", - "rapidcheck", - "fmt", -] - -[[fields]] -name = "a_seq_length_dim" -type = "int" - -[[fields]] -name = "b_seq_length_dim" -type = "int" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml new file mode 100644 index 0000000000..394dfb5fcc --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "BatchMatmulAttrs" + +features = [ + "eq", + "ord", + "hash", + "json", + "rapidcheck", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", + "", +] + +src_includes = [ + "utils/fmt/optional.h", + "utils/json/optional.h", + "utils/rapidcheck/optional.h", +] + +[[fields]] +name = "a_seq_length_dim" +type = "std::optional<::FlexFlow::nonnegative_int>" + +[[fields]] +name = "b_seq_length_dim" +type = "std::optional<::FlexFlow::nonnegative_int>" diff --git a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml index e7eeedec06..b3c574264c 100644 --- a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -20,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "combine_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml index 77e8c51244..c4fb74ebd8 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml @@ -12,23 +12,24 @@ features = [ includes = [ "", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "num_samples" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] 
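These .struct.toml files are dtgen specs: each [[fields]] entry (the num_channels, height, and width entries continue right below) becomes a member of a generated struct, so retyping a field from size_t to ::FlexFlow::nonnegative_int propagates into every generated constructor and accessor. Roughly, and only as an assumed illustration of the generated shape rather than actual dtgen output:

    // Assumed shape of the code dtgen emits for conv_2d_input_shape.struct.toml.
    #include "op-attrs/datatype.dtg.h"
    #include "utils/nonnegative_int/nonnegative_int.h"

    namespace FlexFlow {
    struct Conv2DInputShape {
      nonnegative_int num_samples;
      nonnegative_int num_channels;
      nonnegative_int height;
      nonnegative_int width;
      DataType datatype;
      // The spec's `features` list (eq, ord, hash, json, rapidcheck, fmt)
      // additionally requests comparison operators, hashing, JSON
      // (de)serialization, rapidcheck generators, and fmt support.
    };
    } // namespace FlexFlow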
name = "num_channels" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "height" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "width" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml index 68cbd878d1..fdf0eaca78 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -32,11 +33,11 @@ type = "::FlexFlow::ShardParallelDim" [[fields]] name = "sum_reduction_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "discard_copy_reduction_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml index 5bef144cd9..8b86d42e04 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "", "op-attrs/activation.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -21,14 +22,14 @@ src_includes = [ ] fields = [ - { name = "out_channels", type = "int" }, - { name = "kernel_h", type = "int" }, - { name = "kernel_w", type = "int" }, - { name = "stride_h", type = "int" }, - { name = "stride_w", type = "int" }, - { name = "padding_h", type = "int" }, - { name = "padding_w", type = "int" }, - { name = "groups", type = "int" }, + { name = "out_channels", type = "::FlexFlow::nonnegative_int" }, + { name = "kernel_h", type = "::FlexFlow::nonnegative_int" }, + { name = "kernel_w", type = "::FlexFlow::nonnegative_int" }, + { name = "stride_h", type = "::FlexFlow::nonnegative_int" }, + { name = "stride_w", type = "::FlexFlow::nonnegative_int" }, + { name = "padding_h", type = "::FlexFlow::nonnegative_int" }, + { name = "padding_w", type = "::FlexFlow::nonnegative_int" }, + { name = "groups", type = "::FlexFlow::nonnegative_int" }, { name = "activation", type = "std::optional<::FlexFlow::Activation>" }, { name = "use_bias", type = "bool" }, ] diff --git a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml index b8d15284e9..5a857efb3e 100644 --- a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml @@ -10,9 +10,10 @@ features = [ ] includes = [ - "utils/stack_vector/stack_vector.h", "op-attrs/aggregate_op.dtg.h", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", + "", ] src_includes = [ @@ -23,11 +24,11 @@ src_includes = [ [[fields]] name = "num_entries" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "out_channels" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "aggr" diff --git a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml index 0a35a6c5ec..ffbe93c975 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml +++ 
b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml @@ -14,6 +14,7 @@ includes = [ "op-attrs/activation.dtg.h", "op-attrs/regularizer_attrs.dtg.h", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -24,7 +25,7 @@ src_includes = [ [[fields]] name = "out_channels" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "use_bias" diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d.h b/lib/op-attrs/include/op-attrs/ops/pool_2d.h index 1af22ad022..af11d61f07 100644 --- a/lib/op-attrs/include/op-attrs/ops/pool_2d.h +++ b/lib/op-attrs/include/op-attrs/ops/pool_2d.h @@ -13,8 +13,8 @@ CHECK_VALID_OP_ATTR(Pool2DAttrs); tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp pool_type, std::optional const &activation); diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml index 20ca7deabc..fea318d46d 100644 --- a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml @@ -13,6 +13,7 @@ includes = [ "op-attrs/pool_op.dtg.h", "op-attrs/activation.dtg.h", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -23,27 +24,27 @@ src_includes = [ [[fields]] name = "kernel_h" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "kernel_w" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "stride_h" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "stride_w" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "padding_h" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "padding_w" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "pool_type" diff --git a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml index ee0ae54132..2798a85caf 100644 --- a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "reduction_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml index 69c4b7580f..965c40c05a 100644 --- a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -20,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "repartition_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml index 4e43ea747a..58e365c0f2 100644 --- a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml @@ -9,8 +9,10 @@ features = [ "fmt", ] -includes = [ ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] [[fields]] name = "replicate_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git 
a/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml index fce827f5c2..7ce1ad7e34 100644 --- a/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml @@ -13,11 +13,12 @@ includes = [ "utils/stack_vector/stack_vector.h", "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "splits" -type = "::FlexFlow::stack_vector" +type = "::FlexFlow::stack_vector<::FlexFlow::nonnegative_int, MAX_NUM_OUTPUTS>" [[fields]] name = "axis" diff --git a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml index 9ecbf1d725..1c5bfc8e10 100644 --- a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml @@ -9,9 +9,13 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "k" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "sorted" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml index 974b27d2a7..be3a95eec8 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml @@ -13,6 +13,7 @@ includes = [ "op-attrs/parallel_tensor_shape/sum_degree.dtg.h", "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h", "op-attrs/dim_ordered/dim_ordered.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -25,4 +26,4 @@ type = "::FlexFlow::DiscardCopyDegree" [[fields]] name = "shard_degrees" -type = "::FlexFlow::FFOrdered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index 6b88a7bda1..67864e637b 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -9,27 +9,27 @@ namespace FlexFlow { FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &); -FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &); +FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &); std::unordered_set replica_dims(ParallelTensorDims const &); /* size_t get_volume(ParallelTensorDims const &); */ -size_t num_shard_dims(ParallelTensorDims const &); +nonnegative_int num_shard_dims(ParallelTensorDims const &); ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &); ParallelTensorDims lift_to_parallel(TensorDims const &); -ParallelTensorDims - lift_to_parallel_with_degrees(TensorDims const &, - SumDegree const &, - DiscardCopyDegree const &, - FFOrdered const &shard_degrees); +ParallelTensorDims lift_to_parallel_with_degrees( + TensorDims const &, + SumDegree const &, + DiscardCopyDegree const &, + FFOrdered const &shard_degrees); ParallelTensorDims lift_to_parallel_with_degrees(TensorDims const &, ParallelTensorDimDegrees const &); -int total_replica_degree(ParallelTensorDims const &); -int total_shard_degree(ParallelTensorDims const &); -int total_parallel_degree(ParallelTensorDims const &); +nonnegative_int total_replica_degree(ParallelTensorDims const &); +nonnegative_int total_shard_degree(ParallelTensorDims const &); +nonnegative_int total_parallel_degree(ParallelTensorDims const &); ShardParallelDim 
shard_dim_at_idx(ParallelTensorDims const &, relative_ff_dim_t); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index 0339b9b8a6..d461ffc9e4 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -12,12 +12,13 @@ namespace FlexFlow { -int num_shard_dims(ParallelTensorShape const &); +nonnegative_int num_shard_dims(ParallelTensorShape const &); ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, relative_ff_dim_t); -FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &); +FFOrdered + ff_ordered_shard_degrees(ParallelTensorShape const &); std::optional try_get_shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); @@ -25,11 +26,11 @@ std::optional ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorShape const &); ParallelTensorShape lift_to_parallel(TensorShape const &); -ParallelTensorShape - lift_to_parallel_with_degrees(TensorShape const &, - SumDegree const &, - DiscardCopyDegree const &, - FFOrdered const &shard_degrees); +ParallelTensorShape lift_to_parallel_with_degrees( + TensorShape const &, + SumDegree const &, + DiscardCopyDegree const &, + FFOrdered const &shard_degrees); ParallelTensorShape lift_to_parallel_with_degrees(TensorShape const &, ParallelTensorDimDegrees const &); @@ -37,13 +38,13 @@ ParallelTensorShape std::unordered_set replica_dims(ParallelTensorShape const &); TensorShape get_piece_shape(ParallelTensorShape const &); -int get_num_replica_dims(ParallelTensorShape const &); -int get_num_replicas(ParallelTensorShape const &); +nonnegative_int get_num_replica_dims(ParallelTensorShape const &); +nonnegative_int get_num_replicas(ParallelTensorShape const &); -int get_sum_degree(ParallelTensorShape const &); -int get_discard_copy_degree(ParallelTensorShape const &); +nonnegative_int get_sum_degree(ParallelTensorShape const &); +nonnegative_int get_discard_copy_degree(ParallelTensorShape const &); -int get_total_parallel_degree(ParallelTensorShape const &); +nonnegative_int get_total_parallel_degree(ParallelTensorShape const &); bool is_valid(ParallelTensorShape const &); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml index b4905fb0ce..76b52bcdef 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "value" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml index d86917211e..550a384ba9 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "value" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml 
b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml index a44d712dbf..fdd11ac11f 100644 --- a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml +++ b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml @@ -11,7 +11,7 @@ features = [ includes = [ "op-attrs/ops/attention_attrs.dtg.h", - "op-attrs/ops/batch_matmul.dtg.h", + "op-attrs/ops/batch_matmul_attrs.dtg.h", "op-attrs/ops/batch_norm_attrs.dtg.h", "op-attrs/ops/broadcast_attrs.dtg.h", "op-attrs/ops/cast_attrs.dtg.h", diff --git a/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h index af51cc69be..5205b1ead8 100644 --- a/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h +++ b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h @@ -7,7 +7,7 @@ namespace FlexFlow { ff_dim_t ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t ff_dim, - int input_dim); + nonnegative_int input_dim); } // namespace FlexFlow namespace rc { diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml index 2ad442aa22..5ca486181e 100644 --- a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml @@ -11,11 +11,12 @@ features = [ includes = [ "op-attrs/replica_type.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "degree" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "replica_type" diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h index 74a8df339b..92d2b0abb2 100644 --- a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h @@ -8,7 +8,8 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set(); -int get_degree_of_replica_type(ReplicaParallelDimSet const &, ReplicaType); +nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &, + ReplicaType); std::unordered_set get_replica_dims(ReplicaParallelDimSet const &); bool is_valid(ReplicaParallelDimSet const &); diff --git a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml index 21c81396d1..5c5d2dc5b2 100644 --- a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml @@ -9,10 +9,14 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index 5e1503360b..bf11f36e51 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -6,11 +6,11 @@ namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &); +FFOrdered const &ff_ordered(TensorDims const &); -size_t num_dims(TensorDims const &); -size_t dim_at_idx(TensorDims const &, relative_ff_dim_t); -size_t &dim_at_idx(TensorDims &, relative_ff_dim_t); +nonnegative_int num_dims(TensorDims const &); +nonnegative_int dim_at_idx(TensorDims const &, relative_ff_dim_t); +nonnegative_int &dim_at_idx(TensorDims &, relative_ff_dim_t); bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal); diff --git 
a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index b262dd32b6..e86b866fd6 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -8,10 +8,12 @@ features = [ "rapidcheck", "fmt", ] + includes = [ "op-attrs/dim_ordered/dim_ordered.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "ff_ordered" -type = "::FlexFlow::FFOrdered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index b8733cddbe..15958a1daf 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -5,11 +5,11 @@ namespace FlexFlow { -size_t num_dims(TensorShape const &); -size_t dim_at_idx(TensorShape const &, relative_ff_dim_t); -size_t &dim_at_idx(TensorShape &, relative_ff_dim_t); -size_t get_num_elements(TensorShape const &); -size_t get_size_in_bytes(TensorShape const &); +nonnegative_int num_dims(TensorShape const &); +nonnegative_int dim_at_idx(TensorShape const &, relative_ff_dim_t); +nonnegative_int &dim_at_idx(TensorShape &, relative_ff_dim_t); +nonnegative_int get_num_elements(TensorShape const &); +nonnegative_int get_size_in_bytes(TensorShape const &); } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/datatype.cc b/lib/op-attrs/src/op-attrs/datatype.cc index 3bee05c253..9bb3b34390 100644 --- a/lib/op-attrs/src/op-attrs/datatype.cc +++ b/lib/op-attrs/src/op-attrs/datatype.cc @@ -1,23 +1,24 @@ #include "op-attrs/datatype.h" #include "utils/containers/contains.h" #include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { -size_t size_of_datatype(DataType data_type) { +nonnegative_int size_of_datatype(DataType data_type) { switch (data_type) { case DataType::BOOL: - return sizeof(bool); + return nonnegative_int{sizeof(bool)}; case DataType::INT32: - return sizeof(int32_t); + return nonnegative_int{sizeof(int32_t)}; case DataType::INT64: - return sizeof(int64_t); + return nonnegative_int{sizeof(int64_t)}; case DataType::HALF: - return sizeof(float) / 2; + return nonnegative_int{sizeof(float)} / 2_n; case DataType::FLOAT: - return sizeof(float); + return nonnegative_int{sizeof(float)}; case DataType::DOUBLE: - return sizeof(double); + return nonnegative_int{sizeof(double)}; default: throw mk_runtime_error(fmt::format("Unknown DataType {}", data_type)); } diff --git a/lib/op-attrs/src/op-attrs/ff_dim_t.cc b/lib/op-attrs/src/op-attrs/ff_dim_t.cc index 0a99e39a91..44672fc391 100644 --- a/lib/op-attrs/src/op-attrs/ff_dim_t.cc +++ b/lib/op-attrs/src/op-attrs/ff_dim_t.cc @@ -2,7 +2,7 @@ namespace FlexFlow { relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim) { - return relative_ff_dim_t{ff_dim.value.get_value()}; + return relative_ff_dim_t{ff_dim.value.unwrap_nonnegative()}; } } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 57c7105534..10fbf412f7 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -16,79 +16,82 @@ namespace FlexFlow { /* return is_valid; */ /* } */ -int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -int get_vProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int 
get_vProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.vdim; } -int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.embed_dim; } -int get_qSize(TensorShape const &query_shape) { +nonnegative_int get_qSize(TensorShape const &query_shape) { return dim_at_idx(query_shape, relative_ff_dim_t{0}); } -int get_kSize(TensorShape const &key_shape) { +nonnegative_int get_kSize(TensorShape const &key_shape) { return dim_at_idx(key_shape, relative_ff_dim_t{0}); } -int get_vSize(TensorShape const &value_shape) { +nonnegative_int get_vSize(TensorShape const &value_shape) { return dim_at_idx(value_shape, relative_ff_dim_t{0}); } -int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.query_dim.size; } -int get_qSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_qSize(MultiHeadAttentionInputs const &inputs) { return inputs.query_size; } -int get_kSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_kSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.key_dim.size; } -int get_kSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_kSize(MultiHeadAttentionInputs const &inputs) { return inputs.key_size; } -int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.value_dim.size; } -int get_vSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_vSize(MultiHeadAttentionInputs const &inputs) { return inputs.value_size; } -int get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; } -int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; } -int get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; // FIXME -- assumes only prefill } -int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; // FIXME -- assumes only prefill } -int get_num_samples(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_num_samples(MultiHeadAttentionParallelInputs const &inputs) { return inputs.batch_dim.size; } -int get_num_samples(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_num_samples(MultiHeadAttentionInputs const &inputs) { return inputs.batch_size; } @@ -124,10 +127,10 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ parsed.batch_size, parsed.sequence_length, - size_t_from_int(attrs.embed_dim), + attrs.embed_dim, }}, parsed.datatype, }; @@ -147,23 +150,23 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); // W^Q_i in "Attention Is All You Need" top of page 5 - size_t qProjectWeightSize = parsed.query_size * attrs.kdim; + nonnegative_int qProjectWeightSize = parsed.query_size *
attrs.kdim; // W^K_i in "Attention Is All You Need" top of page 5 (all i's put together) - size_t kProjectWeightSize = parsed.key_size * attrs.kdim; + nonnegative_int kProjectWeightSize = parsed.key_size * attrs.kdim; // W^V_i in "Attention Is All You Need" top of page 5 (all i's put together) - size_t vProjectWeightSize = parsed.value_size * attrs.vdim; + nonnegative_int vProjectWeightSize = parsed.value_size * attrs.vdim; // W^O in "Attention Is All You Need" top of page 5, with num_heads factored // out - size_t outWeightSize = attrs.vdim * attrs.embed_dim; + nonnegative_int outWeightSize = attrs.vdim * attrs.embed_dim; return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ (qProjectWeightSize + kProjectWeightSize + vProjectWeightSize + outWeightSize), - size_t_from_int(attrs.num_heads), + attrs.num_heads, }}, parsed.datatype, }; @@ -184,8 +187,8 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.kdim + attrs.kdim + attrs.vdim), + TensorDims{FFOrdered{ + attrs.kdim + attrs.kdim + attrs.vdim, }}, parsed.datatype, }; @@ -206,8 +209,8 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.embed_dim), + TensorDims{FFOrdered{ + attrs.embed_dim, }}, parsed.datatype, }; @@ -235,14 +238,14 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - int joined_dim_degree = 1; - int head_dim_degree = parsed.discard_copy_degree.value; + nonnegative_int joined_dim_degree = 1_n; + nonnegative_int head_dim_degree = parsed.discard_copy_degree.value; return lift_to_parallel_with_degrees( unpar_shape, - SumDegree{1}, + SumDegree{1_n}, DiscardCopyDegree{parsed.batch_dim.degree}, - FFOrdered{joined_dim_degree, head_dim_degree}); + FFOrdered{joined_dim_degree, head_dim_degree}); } tl::expected @@ -273,10 +276,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1}; + FFOrdered shard_degrees = FFOrdered{1_n}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -309,10 +312,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1}; + FFOrdered shard_degrees = FFOrdered{1_n}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -339,402 +342,25 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - int sum_degree = parsed.discard_copy_degree.value; - int discard_copy_degree = 1; - int batch_degree = parsed.batch_dim.degree; - int seq_len_degree = 1; - int out_dim_degree = 1; + nonnegative_int sum_degree = parsed.discard_copy_degree.value; + nonnegative_int discard_copy_degree = 1_n; + nonnegative_int batch_degree = parsed.batch_dim.degree; + nonnegative_int seq_len_degree = 1_n; + nonnegative_int out_dim_degree = 1_n; return lift_to_parallel_with_degrees( unpar_shape, SumDegree{sum_degree}, DiscardCopyDegree{discard_copy_degree}, - FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); + FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); } -int get_oSize(ParallelTensorShape const &) { +nonnegative_int 
get_oSize(ParallelTensorShape const &) { NOT_IMPLEMENTED(); } -int get_oSize(TensorShape const &) { +nonnegative_int get_oSize(TensorShape const &) { NOT_IMPLEMENTED(); } } // namespace FlexFlow - -// Tensor FFModel::multihead_attention(const Tensor query, -// const Tensor key, -// const Tensor value, -// int embed_dim, -// int num_heads, -// int kdim, -// int vdim, -// float dropout, -// bool bias, -// bool add_bias_kv, -// bool add_zero_attn, -// Initializer *kernel_initializer, -// char const *name) { -// Layer *li = new Layer(this, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// query, -// key, -// value); -// { -// int numdims = query->num_dims; -// int dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdims; i++) { -// dims[i] = query->dims[i]; -// } -// dims[0] = embed_dim; -// li->outputs[0] = create_tensor_legion_ordering( -// numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); -// } -// { -// // Compute weight size -// int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, -// oProjSize = embed_dim; -// int qSize = query->dims[0], kSize = key->dims[0], vSize = value->dims[0]; -// int qParas = qProjSize * qSize; -// int kParas = kProjSize * kSize; -// int vParas = vProjSize * vSize; -// int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); -// int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; -// li->weights[0] = create_weight_legion_ordering(2, -// dims, -// DT_FLOAT, -// li, -// true /*create_grad*/, -// kernel_initializer, -// CHOSEN_SYNC_TYPE); -// } -// li->data_type = DT_FLOAT; -// li->add_int_property("embed_dim", embed_dim); -// li->add_int_property("num_heads", num_heads); -// li->add_int_property("kdim", kdim); -// li->add_int_property("vdim", vdim); -// li->add_int_property("bias", bias); -// li->add_int_property("add_bias_kv", add_bias_kv); -// li->add_int_property("add_zero_attn", add_zero_attn); -// li->add_float_property("dropout", dropout); -// layers.push_back(li); -// return li->outputs[0]; -// } - -// MultiHeadAttention::MultiHeadAttention(FFModel &model, -// LayerID const &_layer_guid, -// const ParallelTensor _query, -// const ParallelTensor _key, -// const ParallelTensor _value, -// int _embed_dim, -// int _num_heads, -// int _kdim, -// int _vdim, -// float _dropout, -// bool _bias, -// bool _add_bias_kv, -// bool _add_zero_attn, -// bool allocate_weights, -// char const *name) -// // Initializer* _bias_initializer) -// : Op(model, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// _query, -// _key, -// _value), -// attrs(_embed_dim, -// _num_heads, -// _kdim, -// _vdim, -// _dropout, -// _bias, -// _add_bias_kv, -// _add_zero_attn), -// qSize(_query->dims[0].size), kSize(_key->dims[0].size), -// vSize(_value->dims[0].size), qProjSize(_kdim), -// qoSeqLength(_query->dims[1].size), kvSeqLength(_key->dims[1].size) { -// // overwrite layer_guid -// layer_guid = _layer_guid; - -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// numOutputs = 1; -// int numdim = _query->num_dims; -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i] = _query->dims[i]; -// } -// dims[0].size = _embed_dim; -// // Currently require no parallelism along this dim -// assert(dims[0].degree == 1); -// if (allocate_weights) { -// // Create weight tensor -// int num_dims = inputs[0]->num_dims; -// // Compute weight size -// int qParas = this->qProjSize * 
this->qSize; -// int kParas = kProjSize(attrs) * this->kSize; -// int vParas = vProjSize(attrs) * this->vSize; -// int oParas = oProjSize(attrs) * -// (vProjSize(attrs) > 0 ? vProjSize(attrs) : this->vSize); -// ParallelDim dims[3]; -// dims[0] = inputs[0]->dims[num_dims - 2]; -// dims[0].size = dims[0].degree; -// dims[1] = inputs[0]->dims[num_dims - 1]; -// dims[1].size = this->attrs.num_heads; -// dims[2].size = qParas + kParas + vParas + oParas; -// dims[2].degree = 1; -// dims[2].parallel_idx = -1; -// int seed = std::rand(); -// Initializer *initializer = new GlorotUniform(seed); -// #ifdef USE_NCCL -// ParameterSyncType comm_type = ParameterSyncType::NCCL; -// #else -// ParameterSyncType comm_type = ParameterSyncType::PS; -// #endif -// weights[0] = model.create_parallel_weight<3>(dims, -// DT_FLOAT, -// NULL /*owner_op*/, -// true /*create_grad*/, -// initializer, -// comm_type); -// } - -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// _query->num_dims, dims, DT_FLOAT, this); -// /* for (int i = 0; i < numdim; i++) { */ -// /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ -// /* } */ -// /* // Check correctness */ -// /* assert(check_output_input_weight_parallel_dims()); */ -// } - -// MultiHeadAttention::MultiHeadAttention(FFModel &model, -// const ParallelTensor _query, -// const ParallelTensor _key, -// const ParallelTensor _value, -// const ParallelTensor _weight, -// int _embed_dim, -// int _num_heads, -// int _kdim, -// int _vdim, -// float _dropout, -// bool _bias, -// bool _add_bias_kv, -// bool _add_zero_attn, -// bool allocate_weights, -// char const *name) -// // Initializer* _bias_initializer) -// : Op(model, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// _query, -// _key, -// _value, -// _weight), -// attrs(_embed_dim, -// _num_heads, -// _kdim, -// _vdim, -// _dropout, -// _bias, -// _add_bias_kv, -// _add_zero_attn), -// qSize(_query->dims[0].size), kSize(_key->dims[0].size), -// vSize(_value->dims[0].size), qProjSize(_kdim), -// qoSeqLength(_query->dims[1].size), kvSeqLength(_key->dims[1].size) -// // bias_initializer(_bias_initializer) -// { -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// numOutputs = 1; -// int numdim = _query->num_dims; -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i] = _query->dims[i]; -// } -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// dims[0].size = _embed_dim; -// // Currently require no parallelism along this dim -// assert(dims[0].degree == 1); -// if (allocate_weights) { -// // Create weight tensor -// int num_dims = inputs[0]->num_dims; -// // Compute weight size -// int qParas = this->qProjSize * this->qSize; -// int kParas = kProjSize(attrs) * this->kSize; -// int vParas = vProjSize(attrs) * this->vSize; -// int oParas = oProjSize(attrs) * -// (vProjSize(attrs) > 0 ? 
vProjSize(attrs) : this->vSize); -// ParallelDim dims[3]; -// dims[0] = inputs[0]->dims[num_dims - 2]; -// dims[0].size = dims[0].degree; -// dims[1] = inputs[0]->dims[num_dims - 1]; -// dims[1].size = this->attrs.num_heads; -// dims[2].size = qParas + kParas + vParas + oParas; -// int seed = std::rand(); -// Initializer *initializer = new GlorotUniform(seed); -// #ifdef USE_NCCL -// ParameterSyncType comm_type = ParameterSyncType::NCCL; -// #else -// ParameterSyncType comm_type = ParameterSyncType::PS; -// #endif -// weights[0] = model.create_parallel_weight<3>(dims, -// DT_FLOAT, -// NULL /*owner_op*/, -// true /*create_grad*/, -// initializer, -// comm_type); -// } -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// _query->num_dims, dims, DT_FLOAT, this); - -// /* for (int i = 0; i < numdim; i++) { */ -// /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ -// /* } */ -// /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); -// */ -// /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); -// */ -// // Check correctness -// /* assert(check_output_input_weight_parallel_dims()); */ -// } - -// void MultiHeadAttention::forward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_forward(ff, argmap); -// int idx = 0; -// IndexLauncher launcher(ATTENTION_FWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[1]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[2]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// weights[0]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(4, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// void MultiHeadAttention::backward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_backward(ff, argmap); -// IndexLauncher launcher(ATTENTION_BWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[1]->region)); -// launcher.add_field(1, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, 
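// A minimal standalone sketch (hypothetical sizes, plain ints rather than
// FlexFlow's nonnegative_int) of the flattened weight-count arithmetic that
// both the commented-out legacy code above and get_weights_shape earlier
// perform: per head, the Q/K/V projections and the output projection are
// packed into a single flat dimension.
#include <cassert>
int main() {
  int q_size = 64, k_size = 64, v_size = 64; // hypothetical input feature sizes
  int kdim = 32, vdim = 32, embed_dim = 128; // hypothetical attrs
  int q_proj = q_size * kdim;                // W^Q_i
  int k_proj = k_size * kdim;                // W^K_i
  int v_proj = v_size * vdim;                // W^V_i
  int o_proj = vdim * embed_dim;             // W^O, with num_heads factored out
  assert(q_proj + k_proj + v_proj + o_proj == 2048 + 2048 + 2048 + 4096);
  return 0;
}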
-// inputs[2]->region)); -// launcher.add_field(2, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// weights[0]->region)); -// launcher.add_field(3, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// outputs[0]->region_grad)); -// launcher.add_field(4, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// weights[0]->region_grad)); -// launcher.add_field(5, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[0]->region_grad)); -// launcher.add_field(6, FID_DATA); -// int num_regions = 7; -// if (inputs[1]->region != inputs[0]->region) { -// // when key != query -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[1]->region_grad)); -// launcher.add_field(num_regions++, FID_DATA); -// } -// if ((inputs[2]->region != inputs[0]->region) && -// (inputs[2]->region != inputs[1]->region)) { -// // when value != key and value != query -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[2]->region_grad)); -// launcher.add_field(num_regions++, FID_DATA); -// } -// runtime->execute_index_space(ctx, launcher); -// } diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc index 97544d1750..b9049bf461 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc @@ -31,9 +31,9 @@ tl::expected 3)); } - size_t seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); - size_t seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); - size_t seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); + nonnegative_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); + nonnegative_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); + nonnegative_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); if (!all_same(seq_len_q, seq_len_k, seq_len_v)) { return tl::unexpected(fmt::format( @@ -43,9 +43,9 @@ tl::expected seq_len_v)); } - size_t batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); - size_t batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); - size_t batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); + nonnegative_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); + nonnegative_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); + nonnegative_int batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); if (!all_same(batch_size_q, batch_size_k, batch_size_v)) { return tl::unexpected(fmt::format( @@ -63,9 +63,9 @@ tl::expected input_v.data_type)); } - size_t q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); - size_t k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); - size_t v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); + nonnegative_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); + nonnegative_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); + nonnegative_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); return MultiHeadAttentionInputs{ batch_size_q, diff --git 
a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc index 3bd0825555..d69b62b759 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc @@ -107,9 +107,9 @@ tl::expected value_dim.degree)); } - int discard_copy_q = get_discard_copy_degree(input_q); - int discard_copy_k = get_discard_copy_degree(input_k); - int discard_copy_v = get_discard_copy_degree(input_v); + nonnegative_int discard_copy_q = get_discard_copy_degree(input_q); + nonnegative_int discard_copy_k = get_discard_copy_degree(input_k); + nonnegative_int discard_copy_v = get_discard_copy_degree(input_v); if (!all_same(discard_copy_q, discard_copy_k, discard_copy_v)) { return tl::unexpected(fmt::format("Q, K, V disagree on the discard-copy " diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc index 71118db7a6..d32ae33d14 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc @@ -57,13 +57,13 @@ tl::expected input_rhs.data_type)); } - size_t lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); - size_t n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); - size_t lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); + nonnegative_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); + nonnegative_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); + nonnegative_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); - size_t rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); - size_t rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); - size_t p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); + nonnegative_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); + nonnegative_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); + nonnegative_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( @@ -76,7 +76,7 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ lhs_b, n, p, @@ -151,9 +151,10 @@ tl::expected ShardParallelDim output_n = n; ShardParallelDim output_p = p; - int output_discard_copy_degree = 1; - int output_sum_degree = get_total_parallel_degree(input_lhs) / - (output_b.degree * output_n.degree * output_p.degree); + nonnegative_int output_discard_copy_degree = 1_n; + nonnegative_int output_sum_degree = + get_total_parallel_degree(input_lhs) / + (output_b.degree * output_n.degree * output_p.degree); ParallelTensorShape result = ParallelTensorShape{ ParallelTensorDims{ diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index 472e5f1a25..ed58fe5189 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -67,10 +67,10 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - size_t num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_channels, }}, DataType::FLOAT, @@ -97,26 +97,23 @@ static std::optional input_degrees); } - if (input_degrees.sum_degree != SumDegree{1}) { + if (input_degrees.sum_degree != SumDegree{1_n}) { return fmt::format("Expected sum degree 1, but received sum degree {}", input_degrees.sum_degree); } -
if (input_degrees.discard_copy_degree != DiscardCopyDegree{1}) { + if (input_degrees.discard_copy_degree != DiscardCopyDegree{1_n}) { return fmt::format( "Expected discard copy degree 1, but received discard copy degree {}", input_degrees.discard_copy_degree); } - FFOrdered non_channel_degrees = - concat(slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{0}}, - ff_dim_t{nonnegative_int{1}}), - slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{2}}, - std::nullopt)); + FFOrdered non_channel_degrees = + concat(slice(input_degrees.shard_degrees, ff_dim_t{0_n}, ff_dim_t{1_n}), + slice(input_degrees.shard_degrees, ff_dim_t{2_n}, std::nullopt)); - if (any_of(non_channel_degrees, [](int degree) { return degree != 1; })) { + if (any_of(non_channel_degrees, + [](nonnegative_int degree) { return degree != 1_n; })) { return fmt::format("Expected parallel degree of all non-channel dimensions " "to be 1, but received input with degrees {}", input_degrees); @@ -159,9 +156,9 @@ tl::expected relative_ff_dim_t channel_dim = relative_ff_dim_t{1}; return ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index 3019151236..fc42241ef2 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -17,7 +17,8 @@ tl::expected get_output_shape(ConcatAttrs const &attrs, std::vector const &inputs) { auto get_non_axis_dims = [&](TensorShape const &s) { - std::map dim_sizes = enumerate(ff_ordered(s.dims)); + std::map dim_sizes = + enumerate(ff_ordered(s.dims)); dim_sizes.erase(attrs.axis); return dim_sizes; }; @@ -40,8 +41,8 @@ tl::expected inputs)); } - std::map non_axis_dims = ({ - tl::expected, std::string> returned = + std::map non_axis_dims = ({ + tl::expected, std::string> returned = require_all_same1(transform(inputs, get_non_axis_dims)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -49,12 +50,12 @@ tl::expected returned.value(); }); - std::vector axis_dim_sizes = + std::vector axis_dim_sizes = transform(inputs, [&](TensorShape const &s) { return dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); }); - size_t output_axis_dim_size = sum(axis_dim_sizes); + nonnegative_int output_axis_dim_size = sum(axis_dim_sizes); non_axis_dims.insert({attrs.axis, output_axis_dim_size}); @@ -88,7 +89,7 @@ tl::expected }); SumDegree sum_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_sum_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -97,7 +98,7 @@ tl::expected }); DiscardCopyDegree discard_copy_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_discard_copy_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index eac756cc15..d1ba536d24 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -25,11 +25,11 @@ TensorShape get_kernel_shape(Conv2DAttrs const &attrs, Conv2DInputShape input = parse_input_shape(raw_input_shape); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.out_channels), + TensorDims{FFOrdered{ + attrs.out_channels,
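// A standalone sketch (hypothetical degrees, plain ints) of the sum-degree
// bookkeeping in the batch_matmul hunk above: the output keeps the shard
// degrees it can express, and whatever parallelism of the lhs input is left
// over becomes the output's sum degree.
#include <cassert>
int main() {
  int total_lhs_degree = 8;            // e.g. 2-way batch shard x 4-way sum
  int out_b = 2, out_n = 1, out_p = 1; // output shard degrees
  int out_sum_degree = total_lhs_degree / (out_b * out_n * out_p);
  assert(out_sum_degree == 4); // the 4-way contraction survives as a sum degree
  return 0;
}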
input.num_channels, - size_t_from_int(attrs.kernel_h), - size_t_from_int(attrs.kernel_w), + attrs.kernel_h, + attrs.kernel_w, }}, input.datatype, }; @@ -42,29 +42,44 @@ TensorShape get_bias_shape(Conv2DAttrs const &attrs, return TensorShape{ TensorDims{ - FFOrdered{size_t_from_int(attrs.out_channels)}, + FFOrdered{attrs.out_channels}, }, input.datatype, }; } +static nonnegative_int calculate_output_size(nonnegative_int input_size, + nonnegative_int padding_size, + nonnegative_int kernel_size, + nonnegative_int stride) { + int input_size_raw = input_size.unwrap_nonnegative(); + int padding_raw = padding_size.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.unwrap_nonnegative(); + int stride_raw = stride.unwrap_nonnegative(); + + return nonnegative_int{ + (input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; +} + TensorShape get_output_shape(Conv2DAttrs const &attrs, TensorShape const &raw_input_shape) { assert(attrs.groups == 1); // TODO(@lockshaw): currently not supported Conv2DInputShape input = parse_input_shape(raw_input_shape); - size_t out_height = - (input.height + (2 * attrs.padding_h) - attrs.kernel_h) / attrs.stride_h + - 1; - size_t out_width = - (input.width + (2 * attrs.padding_w) - attrs.kernel_w) / attrs.stride_w + - 1; - - assert(attrs.out_channels > 0); - - return TensorShape{TensorDims{FFOrdered{ + nonnegative_int out_height = + calculate_output_size(/*input_size=*/input.height, + /*padding_size=*/attrs.padding_h, + /*kernel_size=*/attrs.kernel_h, + /*stride_size=*/attrs.stride_h); + nonnegative_int out_width = + calculate_output_size(/*input_size=*/input.width, + /*padding_size=*/attrs.padding_w, + /*kernel_size=*/attrs.kernel_w, + /*stride_size=*/attrs.stride_w); + + return TensorShape{TensorDims{FFOrdered{ input.num_samples, - size_t_from_int(attrs.out_channels), + attrs.out_channels, out_height, out_width, }}, @@ -82,14 +97,14 @@ ParallelTensorShape get_kernel_shape(Conv2DAttrs const &attrs, assert(parsed.height_dim.degree == 1); assert(parsed.width_dim.degree == 1); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.sample_dim.degree * parsed.sum_reduction_degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, parsed.channel_dim.degree, - 1, - 1, + 1_n, + 1_n, }; return lift_to_parallel_with_degrees( @@ -109,7 +124,7 @@ ParallelTensorShape get_bias_shape(Conv2DAttrs const &attrs, DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.height_dim.degree * parsed.width_dim.degree * parsed.sample_dim.degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, }; @@ -130,12 +145,12 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{parsed.sum_reduction_degree * parsed.channel_dim.degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = { + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = { parsed.sample_dim.degree, parsed.discard_copy_reduction_degree, - 1, - 1, + 1_n, + 1_n, }; return lift_to_parallel_with_degrees( diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc index aad067feb2..1491410491 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc +++ 
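// A standalone sketch of the sliding-window formula factored into
// calculate_output_size above (values are hypothetical):
#include <cassert>
static int output_size(int input, int padding, int kernel, int stride) {
  return (input + 2 * padding - kernel) / stride + 1;
}
int main() {
  // a 32-wide input with a 3-wide kernel, stride 2, padding 1 -> 16-wide output
  assert(output_size(/*input=*/32, /*padding=*/1, /*kernel=*/3, /*stride=*/2) == 16);
  return 0;
}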
b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc @@ -6,10 +6,10 @@ namespace FlexFlow { Conv2DInputShape parse_input_shape(TensorShape const &input) { assert(num_dims(input) == 4); - size_t num_samples = dim_at_idx(input, relative_ff_dim_t{0}); - size_t in_channels = dim_at_idx(input, relative_ff_dim_t{1}); - size_t in_height = dim_at_idx(input, relative_ff_dim_t{2}); - size_t in_width = dim_at_idx(input, relative_ff_dim_t{3}); + nonnegative_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); + nonnegative_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); + nonnegative_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); + nonnegative_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); return Conv2DInputShape{ num_samples, diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index fe557695da..29bd70be2f 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -50,9 +50,9 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(attrs.num_entries), - size_t_from_int(attrs.out_channels), + FFOrdered{ + attrs.num_entries, + attrs.out_channels, }, }, attrs.data_type, @@ -74,8 +74,8 @@ tl::expected SumDegree sum_degree = SumDegree{shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = transform(input.dims.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); @@ -96,13 +96,13 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product( - transform(ff_ordered_shard_dims(input.dims), - [](ShardParallelDim const &d) -> int { return d.degree; }))}; - int entry_dim_degree = 1; - int out_channel_degree = get_discard_copy_degree(input); - FFOrdered shard_degrees = { + SumDegree sum_degree = SumDegree{1_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(transform( + ff_ordered_shard_dims(input.dims), + [](ShardParallelDim const &d) -> nonnegative_int { return d.degree; }))}; + nonnegative_int entry_dim_degree = 1_n; + nonnegative_int out_channel_degree = get_discard_copy_degree(input); + FFOrdered shard_degrees = { entry_dim_degree, out_channel_degree, }; diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index bc86102566..8ed12167b3 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -11,12 +11,11 @@ namespace FlexFlow { TensorShape get_output_shape(FlatAttrs const &attrs, TensorShape const &input_shape) { - FFOrdered leading_dims = slice(ff_ordered(input_shape.dims), - ff_dim_t{nonnegative_int{0}}, - attrs.start_dim); - FFOrdered flattened_dims = + FFOrdered leading_dims = + slice(ff_ordered(input_shape.dims), ff_dim_t{0_n}, attrs.start_dim); + FFOrdered flattened_dims = slice(ff_ordered(input_shape.dims), attrs.start_dim, attrs.end_dim); - FFOrdered trailing_dims = + FFOrdered trailing_dims = slice(ff_ordered(input_shape.dims), attrs.end_dim, std::nullopt); if (flattened_dims.empty()) { @@ -38,14 +37,15 @@ TensorShape get_output_shape(FlatAttrs const &attrs, tl::expected get_output_parallel_dim_degrees( FlatAttrs const &attrs, ParallelTensorDimDegrees const &input_degrees) { - FFOrdered 
flattened_dim_degrees = + FFOrdered flattened_dim_degrees = slice(input_degrees.shard_degrees, attrs.start_dim, attrs.end_dim); if (flattened_dim_degrees.empty()) { return input_degrees; } - if (any_of(flattened_dim_degrees, [](int degree) { return degree != 1; })) { + if (any_of(flattened_dim_degrees, + [](nonnegative_int degree) { return degree != 1; })) { return tl::unexpected( fmt::format("get_output_parallel_dim_degrees for {} expected all shard " "degrees of flattened dimensions to be 1, but received {}", @@ -58,9 +58,7 @@ tl::expected /*discard_copy_degree=*/input_degrees.discard_copy_degree, /*shard_degrees=*/ concat(std::vector{ - slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{0}}, - attrs.start_dim), + slice(input_degrees.shard_degrees, ff_dim_t{0_n}, attrs.start_dim), {product(flattened_dim_degrees)}, slice(input_degrees.shard_degrees, attrs.end_dim, std::nullopt), }), diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 86426dd18f..2394579e53 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -71,7 +71,7 @@ tl::expected std::vector non_layer_norm_dim_idxs = filter( get_idxs(input_shape.dims.ff_ordered), [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); - std::vector raw_weight_dims = + std::vector raw_weight_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { return dim_at_idx(input_shape, relative_ff_dim_t_from_ff_dim_t(dim_idx)); @@ -174,8 +174,8 @@ tl::expected ParallelTensorDims{ ff_ordered_of(raw_weight_shard_dims), ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index e00a47d490..0387c143d7 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -41,11 +41,11 @@ RecordFormatter as_dot(LinearAttrs const &attrs) { tl::expected get_projection_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { - size_t in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); + nonnegative_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); return TensorShape{ TensorDims{ - FFOrdered{in_channels, size_t_from_int(attrs.out_channels)}, + FFOrdered{in_channels, attrs.out_channels}, }, input_shape.data_type, }; @@ -55,7 +55,7 @@ tl::expected get_bias_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { return TensorShape{ TensorDims{ - FFOrdered{size_t_from_int(attrs.out_channels)}, + FFOrdered{attrs.out_channels}, }, input_shape.data_type, }; @@ -64,8 +64,7 @@ tl::expected tl::expected get_output_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { TensorShape output_shape = input_shape; - output_shape.dims.ff_ordered.at(relative_ff_dim_t{-1}) = - size_t_from_int(attrs.out_channels); + output_shape.dims.ff_ordered.at(relative_ff_dim_t{-1}) = attrs.out_channels; return output_shape; } @@ -82,12 +81,12 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = FFOrdered{ + FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, 
get_discard_copy_degree(input), }; @@ -112,7 +111,8 @@ tl::expected shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice( ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = FFOrdered{get_discard_copy_degree(input)}; + FFOrdered shard_degrees = + FFOrdered{get_discard_copy_degree(input)}; return lift_to_parallel_with_degrees( unpar, sum_degree, discard_copy_degree, shard_degrees); @@ -133,8 +133,8 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = ff_ordered_shard_degrees(input); + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = ff_ordered_shard_degrees(input); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); return lift_to_parallel_with_degrees( diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index 86d287ebc8..f9630e16b1 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -8,8 +8,8 @@ namespace FlexFlow { tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp pool_type, std::optional const &activation) { // AdaptivePool2D semantics pulled from @@ -22,10 +22,10 @@ tl::expected input_dims)); } - size_t num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); - size_t num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); - size_t input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); - size_t input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); + nonnegative_int num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); + nonnegative_int num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); + nonnegative_int input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); + nonnegative_int input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); if (input_h % output_h != 0) { return tl::unexpected(fmt::format( @@ -55,29 +55,29 @@ tl::expected // = `ind / outd` // = `stride` - int kernel_h = input_h / output_h; - int kernel_w = input_w / output_w; + nonnegative_int kernel_h = input_h / output_h; + nonnegative_int kernel_w = input_w / output_w; - int stride_h = kernel_h; - int stride_w = kernel_w; + nonnegative_int stride_h = kernel_h; + nonnegative_int stride_w = kernel_w; Pool2DAttrs attrs = Pool2DAttrs{ /*kernel_h=*/kernel_h, /*kernel_w=*/kernel_w, /*stride_h=*/stride_h, /*stride_w=*/stride_w, - /*padding_h=*/0, - /*padding_w=*/0, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/pool_type, /*activation=*/activation, }; TensorShape expected_ouput_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, num_channels, - size_t_from_int(output_h), - size_t_from_int(output_w), + output_h, + output_w, }}, DataType::FLOAT, }; @@ -104,6 +104,19 @@ tl::expected return attrs; } +static nonnegative_int calculate_output_size(nonnegative_int input_size, + nonnegative_int padding_size, + nonnegative_int kernel_size, + nonnegative_int stride) { + int input_size_raw = input_size.unwrap_nonnegative(); + int padding_raw = padding_size.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.unwrap_nonnegative(); + int stride_raw = stride.unwrap_nonnegative(); + + return nonnegative_int{ + 
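// A standalone sketch (hypothetical sizes) of the AdaptivePool2D reduction in
// make_adaptive_pool2d_attrs above: once input % output == 0 has been checked,
// kernel and stride both equal input / output, so an ordinary Pool2DAttrs
// reproduces the adaptive behavior exactly.
#include <cassert>
int main() {
  int input_h = 8, output_h = 2;
  assert(input_h % output_h == 0);   // precondition checked by the function
  int kernel_h = input_h / output_h; // 4
  int stride_h = kernel_h;           // 4
  // zero padding: (8 + 0 - 4) / 4 + 1 == 2, recovering the requested size
  assert((input_h - kernel_h) / stride_h + 1 == output_h);
  return 0;
}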
(input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; +} + tl::expected get_output_shape(Pool2DAttrs const &attrs, TensorShape const &input_shape) { if (num_dims(input_shape) != 4) { @@ -113,19 +126,23 @@ tl::expected input_shape)); } - size_t num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); - size_t num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); - size_t input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); - size_t input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); - - size_t output_height = - (input_height + 2 * attrs.padding_h - attrs.kernel_h) / attrs.stride_h + - 1; - - size_t output_width = - (input_width + 2 * attrs.padding_w - attrs.kernel_w) / attrs.stride_w + 1; - - return TensorShape{TensorDims{FFOrdered{ + nonnegative_int num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); + nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + nonnegative_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); + nonnegative_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); + + nonnegative_int output_height = + calculate_output_size(/*input_size=*/input_height, + /*padding_size=*/attrs.padding_h, + /*kernel_size=*/attrs.kernel_h, + /*stride_size=*/attrs.stride_h); + nonnegative_int output_width = + calculate_output_size(/*input_size=*/input_width, + /*padding_size=*/attrs.padding_w, + /*kernel_size=*/attrs.kernel_w, + /*stride_size=*/attrs.stride_w); + + return TensorShape{TensorDims{FFOrdered{ num_samples, num_channels, output_height, diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 0bb940924a..7a8f91e498 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -7,9 +7,11 @@ #include "op-attrs/tensor_dims.h" #include "utils/containers/all_of.h" #include "utils/containers/product.h" +#include "utils/containers/repeat_element.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -17,7 +19,8 @@ FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &d) { return d.shard_dims; } -FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &d) { +FFOrdered + ff_ordered_shard_degrees(ParallelTensorDims const &d) { return transform(d.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); } @@ -27,8 +30,8 @@ std::unordered_set return get_replica_dims(d.replica_dims); } -size_t num_shard_dims(ParallelTensorDims const &dims) { - return dims.shard_dims.size(); +nonnegative_int num_shard_dims(ParallelTensorDims const &dims) { + return num_elements(dims.shard_dims); } ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { @@ -40,22 +43,22 @@ ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { } ParallelTensorDims lift_to_parallel(TensorDims const &dims) { - std::vector shard_degrees(num_dims(dims), - 1); // 1 repeated num_dims(dims) times + std::vector shard_degrees = + repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_n); return lift_to_parallel_with_degrees( - dims, SumDegree{1}, DiscardCopyDegree{1}, shard_degrees); + dims, SumDegree{1_n}, DiscardCopyDegree{1_n}, shard_degrees); } -ParallelTensorDims - lift_to_parallel_with_degrees(TensorDims const &unpar, - SumDegree const &sum_degree, - DiscardCopyDegree const &discard_copy_degree, - 
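// A simplified sketch of the lifting in parallel_tensor_dims.cc above: each
// unparallelized dim size is zipped with its shard degree to produce a shard
// dim (types reduced to plain ints here; the real code pairs nonnegative_int
// sizes and degrees into ShardParallelDim values).
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>
int main() {
  std::vector<int> sizes = {8, 4};   // hypothetical tensor dims
  std::vector<int> degrees = {2, 1}; // hypothetical shard degrees
  std::vector<std::pair<int, int>> shard_dims; // (size, degree) pairs
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    shard_dims.emplace_back(sizes[i], degrees[i]);
  }
  assert(shard_dims.front() == std::make_pair(8, 2));
  return 0;
}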
FFOrdered const &shard_degrees) { +ParallelTensorDims lift_to_parallel_with_degrees( + TensorDims const &unpar, + SumDegree const &sum_degree, + DiscardCopyDegree const &discard_copy_degree, + FFOrdered const &shard_degrees) { std::vector lifted = transform(zip(vector_of(unpar.ff_ordered), vector_of(shard_degrees)), - [](std::pair const &p) { - size_t size = p.first; - int degree = p.second; + [](std::pair const &p) { + nonnegative_int size = p.first; + nonnegative_int degree = p.second; return ShardParallelDim{size, degree}; }); @@ -75,17 +78,17 @@ ParallelTensorDims degrees.shard_degrees); } -int total_replica_degree(ParallelTensorDims const &dims) { +nonnegative_int total_replica_degree(ParallelTensorDims const &dims) { return dims.replica_dims.discard_copy_degree.value * dims.replica_dims.sum_degree.value; } -int total_shard_degree(ParallelTensorDims const &dims) { +nonnegative_int total_shard_degree(ParallelTensorDims const &dims) { return product(transform(vector_of(dims.shard_dims), [](ShardParallelDim const &d) { return d.degree; })); } -int total_parallel_degree(ParallelTensorDims const &dims) { +nonnegative_int total_parallel_degree(ParallelTensorDims const &dims) { return total_replica_degree(dims) * total_shard_degree(dims); } @@ -115,7 +118,7 @@ TensorDims get_tensor_dims_unsafe(ParallelTensorDims const &) { } TensorDims get_reduced_dims(ParallelTensorDims const &dims) { - FFOrdered dim_sizes = transform( + FFOrdered dim_sizes = transform( dims.shard_dims, [](ShardParallelDim const &d) { return d.size; }); return TensorDims{dim_sizes}; } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index bbad13b46b..260ec7c3cd 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -6,11 +6,12 @@ #include "utils/containers/range.h" #include "utils/containers/transform.h" #include "utils/hash-utils.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include "utils/overload.h" namespace FlexFlow { -int num_shard_dims(ParallelTensorShape const &s) { +nonnegative_int num_shard_dims(ParallelTensorShape const &s) { return num_shard_dims(s.dims); } @@ -19,21 +20,21 @@ std::unordered_set return replica_dims(s.dims); } -int get_num_replicas(ParallelTensorShape const &shape) { - return product( - transform(replica_dims(shape), - [](ReplicaParallelDim const &d) -> int { return d.degree; })); +nonnegative_int get_num_replicas(ParallelTensorShape const &shape) { + return product(transform( + replica_dims(shape), + [](ReplicaParallelDim const &d) -> nonnegative_int { return d.degree; })); } -int get_sum_degree(ParallelTensorShape const &shape) { +nonnegative_int get_sum_degree(ParallelTensorShape const &shape) { return shape.dims.replica_dims.sum_degree.value; } -int get_discard_copy_degree(ParallelTensorShape const &shape) { +nonnegative_int get_discard_copy_degree(ParallelTensorShape const &shape) { return shape.dims.replica_dims.discard_copy_degree.value; } -int get_total_parallel_degree(ParallelTensorShape const &s) { +nonnegative_int get_total_parallel_degree(ParallelTensorShape const &s) { return total_parallel_degree(s.dims); } @@ -51,7 +52,8 @@ ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, return shard_dim_at_idx(s.dims, d); } -FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &s) { +FFOrdered + ff_ordered_shard_degrees(ParallelTensorShape const &s) { return ff_ordered_shard_degrees(s.dims); } @@ -73,11 +75,11 @@ ParallelTensorShape 
lift_to_parallel(TensorShape const &s) { return ParallelTensorShape{lift_to_parallel(s.dims), s.data_type}; } -ParallelTensorShape - lift_to_parallel_with_degrees(TensorShape const &unpar, - SumDegree const &sum_degree, - DiscardCopyDegree const &discard_copy_degree, - FFOrdered const &shard_degrees) { +ParallelTensorShape lift_to_parallel_with_degrees( + TensorShape const &unpar, + SumDegree const &sum_degree, + DiscardCopyDegree const &discard_copy_degree, + FFOrdered const &shard_degrees) { return ParallelTensorShape{ lift_to_parallel_with_degrees( unpar.dims, sum_degree, discard_copy_degree, shard_degrees), @@ -95,8 +97,8 @@ ParallelTensorShape } TensorShape require_not_parallel(ParallelTensorShape const &s) { - int total_degree = get_total_parallel_degree(s); - if (total_degree != 1) { + nonnegative_int total_degree = get_total_parallel_degree(s); + if (total_degree != 1_n) { throw mk_runtime_error( fmt::format("Error: require_not_parallel received a parallel tensor " "shape with parallel degree {}: {}", @@ -124,25 +126,27 @@ TensorShape get_reduced_shape(ParallelTensorShape const &s) { ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape, parallel_tensor_dim_idx_t idx) { - return idx.visit( - overload{[&](ff_dim_t shard_dim) { - return ParallelDim{shape.dims.shard_dims.at(shard_dim)}; - }, - [&](ReplicaType replica_type) { - ReplicaParallelDimSet replicas = shape.dims.replica_dims; - int degree = (ReplicaType::SUM == replica_type - ? replicas.sum_degree.value - : replicas.discard_copy_degree.value); - return ParallelDim{ReplicaParallelDim{degree, replica_type}}; - }}); + return idx.visit(overload{ + [&](ff_dim_t shard_dim) { + return ParallelDim{shape.dims.shard_dims.at(shard_dim)}; + }, + [&](ReplicaType replica_type) { + ReplicaParallelDimSet replicas = shape.dims.replica_dims; + nonnegative_int degree = (ReplicaType::SUM == replica_type + ? 
replicas.sum_degree.value + : replicas.discard_copy_degree.value); + return ParallelDim{ReplicaParallelDim{degree, replica_type}}; + }}); } std::unordered_set get_parallel_tensor_dim_indices(ParallelTensorShape const &shape) { std::unordered_set indices; - extend(indices, transform(range(num_shard_dims(shape.dims)), [](int idx) { - return parallel_tensor_dim_idx_t{ff_dim_t{nonnegative_int{idx}}}; - })); + extend(indices, + transform(nonnegative_range(num_shard_dims(shape.dims)), + [](nonnegative_int idx) { + return parallel_tensor_dim_idx_t{ff_dim_t{idx}}; + })); indices.insert(parallel_tensor_dim_idx_t{ReplicaType::SUM}); indices.insert(parallel_tensor_dim_idx_t{ReplicaType::DISCARD_COPY}); return indices; diff --git a/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc b/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc index 0671bb05f2..a987841b18 100644 --- a/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc +++ b/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc @@ -3,10 +3,10 @@ namespace FlexFlow { ff_dim_t ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t ff_dim, - int input_dim) { + nonnegative_int input_dim) { int raw = ff_dim.value; if (raw < 0) { - raw = input_dim + raw; + raw = input_dim.unwrap_nonnegative() + raw; } return ff_dim_t{nonnegative_int{raw}}; } diff --git a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc index 20c88c77dc..fc712be10b 100644 --- a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc +++ b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc @@ -4,11 +4,11 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set() { - return ReplicaParallelDimSet{SumDegree{1}, DiscardCopyDegree{1}}; + return ReplicaParallelDimSet{SumDegree{1_n}, DiscardCopyDegree{1_n}}; } -int get_order_of_replica_type(ReplicaParallelDimSet const &s, - ReplicaType replica_type) { +nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, + ReplicaType replica_type) { switch (replica_type) { case ReplicaType::SUM: return s.sum_degree.value; diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index f0ac88d8e4..f9198bbe28 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -8,22 +8,23 @@ #include "utils/containers/vector_of.h" #include "utils/containers/zip.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &dims) { +FFOrdered const &ff_ordered(TensorDims const &dims) { return dims.ff_ordered; } -size_t num_dims(TensorDims const &dims) { - return dims.ff_ordered.size(); +nonnegative_int num_dims(TensorDims const &dims) { + return num_elements(dims.ff_ordered); } -size_t dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { +nonnegative_int dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } -size_t &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { +nonnegative_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } @@ -33,8 +34,8 @@ bool tensor_dims_is_broadcastable_to(TensorDims const &curr, return false; } - std::vector curr_dims = vector_of(curr.ff_ordered); - std::vector goal_dims = vector_of(goal.ff_ordered); + std::vector curr_dims = vector_of(curr.ff_ordered); + std::vector goal_dims = vector_of(goal.ff_ordered); for (auto const &[curr_dim, goal_dim] : zip(reversed(curr_dims), reversed(goal_dims))) { diff 
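// A standalone sketch of the negative-index resolution performed by
// ff_dim_t_from_relative_ff_dim_t above (Python-style wrapping; values
// hypothetical):
#include <cassert>
static int resolve_relative_dim(int raw, int input_dim) {
  return raw < 0 ? input_dim + raw : raw;
}
int main() {
  assert(resolve_relative_dim(-1, 4) == 3); // last dim of a 4-d tensor
  assert(resolve_relative_dim(-3, 4) == 1);
  assert(resolve_relative_dim(0, 4) == 0);  // nonnegative indices unchanged
  return 0;
}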
--git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 70ed58aac6..690a07d26a 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -4,26 +4,27 @@ #include "utils/containers/get_only.h" #include "utils/containers/product.h" #include "utils/containers/transform.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -size_t num_dims(TensorShape const &s) { - return s.dims.ff_ordered.size(); +nonnegative_int num_dims(TensorShape const &s) { + return num_elements(s.dims.ff_ordered); } -size_t dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { +nonnegative_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -size_t &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { +nonnegative_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -size_t get_num_elements(TensorShape const &s) { +nonnegative_int get_num_elements(TensorShape const &s) { return product(s.dims.ff_ordered); } -size_t get_size_in_bytes(TensorShape const &s) { +nonnegative_int get_size_in_bytes(TensorShape const &s) { return get_num_elements(s) * size_of_datatype(s.data_type); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/attention.cc b/lib/op-attrs/test/src/op-attrs/ops/attention.cc index eca8559b21..b317c5c69c 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/attention.cc @@ -10,10 +10,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs)") { auto make_attrs = [](bool bias) { return MultiHeadAttentionAttrs{ - /*embed_dim=*/32, - /*num_heads=*/10, - /*kdim=*/32, - /*vdim=*/32, + /*embed_dim=*/32_n, + /*num_heads=*/10_n, + /*kdim=*/32_n, + /*vdim=*/32_n, /*dropout=*/0.0, /*bias=*/bias, /*add_bias_kv=*/false, @@ -58,8 +58,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(MultiHeadAttentionAttrs, TensorShape, " "TensorShape, TensorShape)") { - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; /* Parameter meanings match those at * https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html @@ -75,13 +75,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; TensorShape input_q = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -92,7 +92,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_k = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -103,7 +103,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_v = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -114,10 +114,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, - size_t_from_int(attrs.embed_dim), + attrs.embed_dim, }, }, DataType::FLOAT, @@ -125,9 +125,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ - (feature_size * embed_dim) * 3 + (embed_dim * embed_dim), - size_t_from_int(num_heads), + FFOrdered{ + (feature_size * embed_dim) * 3_n + (embed_dim * embed_dim), + num_heads, }, }, DataType::FLOAT, @@ -135,8 +135,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape 
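// A standalone sketch of get_size_in_bytes above, assuming a hypothetical
// FLOAT tensor of shape {2, 3}: the byte size is the element count times
// size_of_datatype.
#include <cassert>
int main() {
  int num_elements = 2 * 3;  // product over the tensor dims
  int bytes_per_element = 4; // size_of_datatype(DataType::FLOAT)
  assert(num_elements * bytes_per_element == 24);
  return 0;
}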
input_bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(embed_dim * 3), + FFOrdered{ + embed_dim * 3_n, }, }, DataType::FLOAT, @@ -144,8 +144,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(embed_dim), + FFOrdered{ + embed_dim, }, }, DataType::FLOAT, @@ -184,72 +184,94 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallel shape inference") { auto make_q = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_q) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_q) { return lift_to_parallel_with_degrees( - input_q, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_q}); + input_q, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_q}); }; auto make_k = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_k) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_k) { return lift_to_parallel_with_degrees( - input_k, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_k}); + input_k, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_k}); }; auto make_v = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_v) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_v) { return lift_to_parallel_with_degrees( - input_v, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_v}); + input_v, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_v}); }; auto make_o = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_o) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_o) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_o}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_o}); }; - auto make_w = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_e, int o_h) { - return lift_to_parallel_with_degrees( - weights, o_sum, o_eq, FFOrdered{o_e, o_h}); - }; + auto make_w = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_e, + nonnegative_int o_h) { + return lift_to_parallel_with_degrees( + weights, o_sum, o_eq, FFOrdered{o_e, o_h}); + }; - auto make_input_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_in_proj_channel) { - return lift_to_parallel_with_degrees( - input_bias, o_sum, o_eq, FFOrdered{o_in_proj_channel}); - }; + auto make_input_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_in_proj_channel) { + return lift_to_parallel_with_degrees( + input_bias, + o_sum, + o_eq, + FFOrdered{o_in_proj_channel}); + }; - auto make_output_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_out_proj_channel) { - return lift_to_parallel_with_degrees( - output_bias, o_sum, o_eq, FFOrdered{o_out_proj_channel}); - }; + auto make_output_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_out_proj_channel) { + return lift_to_parallel_with_degrees( + output_bias, + o_sum, + o_eq, + FFOrdered{o_out_proj_channel}); + }; SUBCASE("data parallelism") { - int o_b = 4; + nonnegative_int o_b = 4_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{1_n}, 
o_b, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_o(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); CHECK(result == correct); } @@ -257,7 +279,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{o_b}, 1, 1); + make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, 1_n); CHECK(result == correct); } @@ -265,7 +287,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_b}, 1); + make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); CHECK(result == correct); } @@ -273,25 +295,25 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_b}, 1); + make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); CHECK(result == correct); } } SUBCASE("attention head parallelism") { - int o_h = 2; + nonnegative_int o_h = 2_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1}, 1, 1, 1); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -299,7 +321,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{1}, 1, o_h); + make_w(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_h); CHECK(result == correct); } @@ -307,7 +329,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_h}, 1); + make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); CHECK(result == correct); } @@ -315,26 +337,26 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_h}, 1); + make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); CHECK(result == correct); } } SUBCASE("combined data & attention head parallelism") { - int o_b = 4; - int o_h = 2; + nonnegative_int o_b = 4_n; + nonnegative_int o_h = 2_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1}, o_b, 1, 1); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, o_b, 
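Every make_q/make_k/make_v/make_o helper in these subcases funnels through lift_to_parallel_with_degrees. Spelled out once for the query shape, a sketch of what one such lift produces, assuming (consistently with the ParallelTensorShape literals constructed directly elsewhere in this patch) that ShardParallelDim pairs the full, unpartitioned dim size with its parallel degree; includes are assumed:

#include "doctest/doctest.h"
// #include "op-attrs/parallel_tensor_shape.h"  // path assumed

using namespace ::FlexFlow;

TEST_SUITE(FF_TEST_SUITE) {
  TEST_CASE("lift_to_parallel_with_degrees worked example (illustrative)") {
    TensorShape input_q = TensorShape{
        TensorDims{FFOrdered<nonnegative_int>{40_n, 48_n, 36_n}},
        DataType::FLOAT,
    };

    // shard the batch dim (40) four ways, leave replica degrees trivial
    ParallelTensorShape lifted = lift_to_parallel_with_degrees(
        input_q,
        SumDegree{1_n},
        DiscardCopyDegree{1_n},
        FFOrdered<nonnegative_int>{4_n, 1_n, 1_n});

    ParallelTensorShape correct = ParallelTensorShape{
        ParallelTensorDims{
            FFOrdered<ShardParallelDim>{
                ShardParallelDim{40_n, 4_n},
                ShardParallelDim{48_n, 1_n},
                ShardParallelDim{36_n, 1_n},
            },
            ReplicaParallelDimSet{
                SumDegree{1_n},
                DiscardCopyDegree{1_n},
            },
        },
        DataType::FLOAT,
    };

    CHECK(lifted == correct);
  }
}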
1_n, 1_n); CHECK(result == correct); } @@ -342,7 +364,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{o_b}, 1, o_h); + make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, o_h); CHECK(result == correct); } @@ -350,7 +372,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_b * o_h}, 1); + make_input_bias( + SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); CHECK(result == correct); } @@ -358,7 +381,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_b * o_h}, 1); + make_output_bias( + SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc index 56a2e3fa52..27c59ee497 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(BatchMatmulAttrs, TensorShape)") { - size_t b = 4; - size_t m = 6; - size_t n = 8; - size_t p = 10; + nonnegative_int b = 4_n; + nonnegative_int m = 6_n; + nonnegative_int n = 8_n; + nonnegative_int p = 10_n; BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/0, // TODO figure out if these arguments are still - // relevant - /*b_seq_length_dim=*/0, + /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are + // still relevant + /*b_seq_length_dim=*/0_n, }; TensorShape input_lhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, m, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("valid") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, m, p, @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct_output_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, p, @@ -60,8 +60,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched b") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ - b + 1, + FFOrdered{ + b + 1_n, m, p, }, @@ -78,9 +78,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched m") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, - m + 1, + m + 1_n, p, }, }, @@ -95,27 +95,27 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_output_shape(BatchMatmulAttrs, ParallelTensorShape)") { - size_t b = 2 * 2; - int o_b = 2; - size_t m = 3 * 3; - int o_m = 3; - size_t n = 5 * 5; - int o_n = 5; - size_t p = 7 * 7; - int o_p = 7; - int o_sum = 11; + nonnegative_int b = 2_n * 2_n; + nonnegative_int o_b = 2_n; + nonnegative_int m = 3_n * 3_n; + nonnegative_int o_m = 3_n; + nonnegative_int n = 5_n * 5_n; + nonnegative_int o_n = 5_n; + nonnegative_int p = 7_n * 7_n; + nonnegative_int o_p = 7_n; + nonnegative_int o_sum = 11_n; BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/0, // TODO figure out if these arguments are still - // relevant - /*b_seq_length_dim=*/0, + /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are + // still relevant + /*b_seq_length_dim=*/0_n, }; auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_n, - int o_m) { + nonnegative_int o_b, + nonnegative_int o_n, + nonnegative_int o_m) { return ParallelTensorShape{ 
ParallelTensorDims{ FFOrdered{ @@ -134,9 +134,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_m, - int o_p) { + nonnegative_int o_b, + nonnegative_int o_m, + nonnegative_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -155,9 +155,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_n, - int o_p) { + nonnegative_int o_b, + nonnegative_int o_n, + nonnegative_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -177,10 +177,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("data parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); CHECK(result == correct); } @@ -188,10 +188,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("n parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, o_n, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_n}, 1, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, o_n, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -199,10 +199,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("p parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{o_p}, 1, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_p)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_p}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_p); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p); CHECK(result == correct); } @@ -210,10 +210,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_m), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, o_m, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_m), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_m, 1_n)); tl::expected correct = - make_output(SumDegree{o_m}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_m}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -221,10 +221,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction lhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_sum}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -232,10 +232,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction rhs") { tl::expected result = get_output_shape( attrs, - 
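The reduction-parallel subcase above is the one place a shard degree moves into SumDegree: splitting the contracted dimension m across o_m workers leaves each worker with only a partial matmul, and the true result is the elementwise sum of the partials. A self-contained 1x1 illustration of why combining those shards is a sum-reduction:

#include <cassert>

int main() {
  // A is 1x2, B is 2x1, so the full product is a[0]*b[0] + a[1]*b[1].
  int a[2] = {3, 5};
  int b[2] = {7, 11};

  int partial_worker0 = a[0] * b[0]; // owns the first slice of m
  int partial_worker1 = a[1] * b[1]; // owns the second slice of m

  int full = a[0] * b[0] + a[1] * b[1];

  // combining the shards is exactly a sum-reduction, hence SumDegree{o_m}
  assert(partial_worker0 + partial_worker1 == full);
}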
make_lhs(SumDegree{1}, DiscardCopyDegree{o_sum}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -243,10 +243,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, 1, 1)); - tl::expected correct = - make_output(SumDegree{o_sum * o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); + tl::expected correct = make_output( + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -254,8 +254,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & rhs (invalid)") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); CHECK_MESSAGE( !result.has_value(), "Unexpected successful value: ", result); @@ -264,10 +264,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, o_n, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_sum * o_n}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), + make_rhs( + SumDegree{1_n}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, o_n, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -275,10 +276,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, o_n, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, 1_n), + make_rhs( + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); tl::expected correct = make_output( - SumDegree{o_sum * o_sum}, DiscardCopyDegree{1}, 1, o_n, 1); + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -286,11 +288,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n & m") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, o_n, o_m), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, o_m), make_rhs( - SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1, o_m, 1)); - tl::expected correct = make_output( - SumDegree{o_sum * o_sum * o_m}, DiscardCopyDegree{1}, 1, o_n, 1); + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, o_m, 1_n)); + tl::expected correct = + make_output(SumDegree{o_sum * o_sum * o_m}, + DiscardCopyDegree{1_n}, + 1_n, + o_n, + 1_n); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc 
b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc index 4196394d00..cd9796945c 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc @@ -60,11 +60,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, - 18, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, + 18_n, }}, DataType::FLOAT, }; @@ -72,8 +72,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 14, + TensorDims{FFOrdered{ + 14_n, }}, DataType::FLOAT, }; @@ -140,16 +140,16 @@ TEST_SUITE(FF_TEST_SUITE) { }(); SUBCASE("partition parallelism (in channel dim)") { - int degree = 2; + nonnegative_int degree = 2_n; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, degree, - 1, - 1, + 1_n, + 1_n, }, }; @@ -169,9 +169,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{degree}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -194,9 +194,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{degree}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -214,12 +214,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (not in channel dim)") { - int degree = 2; + nonnegative_int degree = 2_n; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, degree, 1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, degree, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -251,12 +251,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ sum_degree, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1, 1}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -288,12 +288,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, + SumDegree{1_n}, discard_copy_degree, - FFOrdered{1, 1, 1, 1}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 1}, - ShardParallelDim{14, 2}, - ShardParallelDim{16, 1}, - ShardParallelDim{18, 1}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{14_n, 2_n}, + ShardParallelDim{16_n, 1_n}, + ShardParallelDim{18_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -368,11 +368,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 2}, + ShardParallelDim{14_n, 2_n}, 
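Every numeric literal such as 14_n or 2_n in these tests is a user-defined literal that builds a nonnegative_int. The actual FlexFlow definition is not shown in this patch, so the following is only a sketch of how such a suffix can be declared; the type is deliberately renamed to avoid suggesting it is the real one:

#include <cassert>

struct nonnegative_int_sketch {
  unsigned long long value;
};

// integer-literal operands arrive as unsigned long long, so the input is
// nonnegative by construction and no runtime check is needed here
constexpr nonnegative_int_sketch operator""_n(unsigned long long v) {
  return nonnegative_int_sketch{v};
}

int main() {
  assert((14_n).value == 14);
  assert((2_n).value + (3_n).value == 5);
}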
}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -388,11 +388,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 2}, + ShardParallelDim{14_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/cast.cc b/lib/op-attrs/test/src/op-attrs/ops/cast.cc index c7395316ad..e9ec890b4b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/cast.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/cast.cc @@ -12,15 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { CastAttrs attrs = CastAttrs{output_datatype}; - size_t d1 = 12; - size_t d2 = 16; + nonnegative_int d1 = 12_n; + nonnegative_int d2 = 16_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, input_datatype, }; TensorShape output = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, output_datatype, }; @@ -34,24 +34,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_output_shape(CastAttrs, ParallelTensorShape)") { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_features) { + nonnegative_int o_batch, + nonnegative_int o_features) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_features}); + input, + o_sum, + o_eq, + FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_outchannels) { + nonnegative_int o_batch, + nonnegative_int o_outchannels) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_outchannels}); }; - SumDegree sum_degree = SumDegree{2}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3}; - int batch_degree = 4; - int feature_degree = 8; + SumDegree sum_degree = SumDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3_n}; + nonnegative_int batch_degree = 4_n; + nonnegative_int feature_degree = 8_n; ParallelTensorShape par_input = make_input( sum_degree, discard_copy_degree, batch_degree, feature_degree); diff --git a/lib/op-attrs/test/src/op-attrs/ops/combine.cc b/lib/op-attrs/test/src/op-attrs/ops/combine.cc index 577961b7b1..14fbca5b3a 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/combine.cc @@ -10,22 +10,22 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, }; SUBCASE("valid") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 3; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 3_n; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -44,8 +44,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 4; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 4_n; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, 
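The valid/invalid pair in this test is consistent with a divisibility rule: Combine merges partitions of the chosen dim, whose shard degree in this input is 3, so a repartition_degree of 3 is accepted while 4 is rejected. Restated under that reading (an inference from the test, not a quoted rule):

#include <cassert>

bool combine_degree_is_valid(unsigned shard_degree, unsigned combine_degree) {
  return combine_degree != 0 && shard_degree % combine_degree == 0;
}

int main() {
  assert(combine_degree_is_valid(3, 3));  // the "valid" subcase: 3 / 3 leaves 1 shard
  assert(!combine_degree_is_valid(3, 4)); // the "invalid" subcase: 4 does not divide 3
}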
/*repartition_degree=*/degree, diff --git a/lib/op-attrs/test/src/op-attrs/ops/concat.cc b/lib/op-attrs/test/src/op-attrs/ops/concat.cc index 2d9842b1dd..b84cf38753 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/concat.cc @@ -23,12 +23,12 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - size_t dim0_size = 12; - size_t dim2_size = 20; + nonnegative_int dim0_size = 12_n; + nonnegative_int dim2_size = 20_n; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14, + 14_n, dim2_size, }}, DataType::FLOAT, @@ -45,26 +45,26 @@ TEST_SUITE(FF_TEST_SUITE) { } TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16, + 16_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, DataType::FLOAT, }; SUBCASE("input shapes do not shared the same num_dims") { TensorShape mismatched_num_dims = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 20, + 20_n, dim2_size, - 1, + 1_n, }}, DataType::FLOAT, }; @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(attrs, input_shapes); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14 + 16 + 18, + 14_n + 16_n + 18_n, dim2_size, }}, DataType::FLOAT, @@ -118,84 +118,97 @@ TEST_SUITE(FF_TEST_SUITE) { ff_dim_t{nonnegative_int{1}}, }; - size_t dim0_size = 12; - size_t dim2_size = 20; + nonnegative_int dim0_size = 12_n; + nonnegative_int dim2_size = 20_n; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14, + 14_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16, + 16_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{dim0_size, 14 + 16 + 18, dim2_size}}, + TensorDims{FFOrdered{ + dim0_size, 14_n + 16_n + 18_n, dim2_size}}, DataType::FLOAT, }; - auto lift_input1 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input1 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_input2 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input2 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_input3 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input3 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( 
+ input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("sum reduction parallelism") { SUBCASE("matching") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; std::vector inputs = { - lift_input1(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), + lift_input1(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + lift_output(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{2}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(SumDegree{4}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(SumDegree{4}, DiscardCopyDegree{1}, 1, 1, 1), + lift_input1(SumDegree{2_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), }; std::optional result = @@ -208,27 +221,27 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("discard copy reduction parallelism") { SUBCASE("matching") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; std::vector inputs = { - lift_input1(SumDegree{1}, discard_copy_degree, 1, 1, 1), - lift_input2(SumDegree{1}, discard_copy_degree, 1, 1, 1), - lift_input3(SumDegree{1}, discard_copy_degree, 1, 1, 1), + lift_input1(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(SumDegree{1}, discard_copy_degree, 1, 1, 1); + lift_output(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{2}, 1, 1, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{2}, 1, 1, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{4}, 1, 1, 1), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{4_n}, 1_n, 1_n, 1_n), }; std::optional result = @@ -241,12 +254,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in axis dim") { SUBCASE("matching") { - int degree = 2; + nonnegative_int degree = 2_n; std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), + lift_input1( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + lift_input2( + SumDegree{1_n}, 
DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + lift_input3( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), }; std::optional result = @@ -258,9 +274,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 1, 2, 1), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 2_n, 1_n), }; std::optional result = @@ -273,31 +289,31 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in non-axis shard dims") { SUBCASE("matching") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; std::vector inputs = { lift_input1( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), lift_input2( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), lift_input3( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = lift_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 2, 1, 4), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 4, 1, 2), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 4, 1, 2), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 2_n, 1_n, 4_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), }; std::optional result = @@ -309,21 +325,21 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("parallelism degrees are not mutually exclusive") { - SumDegree sum_degree = SumDegree{3}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5}; - int degree0 = 2; - int degree2 = 4; + SumDegree sum_degree = SumDegree{3_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5_n}; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; std::vector inputs = { - lift_input1(sum_degree, discard_copy_degree, degree0, 1, degree2), - lift_input2(sum_degree, discard_copy_degree, degree0, 1, degree2), - lift_input3(sum_degree, discard_copy_degree, degree0, 1, degree2), + lift_input1(sum_degree, discard_copy_degree, degree0, 1_n, degree2), + lift_input2(sum_degree, discard_copy_degree, degree0, 1_n, degree2), + lift_input3(sum_degree, discard_copy_degree, degree0, 1_n, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, discard_copy_degree, degree0, 1, degree2); + lift_output(sum_degree, discard_copy_degree, degree0, 1_n, degree2); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc index 7abb98f3e3..f5006d4352 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc @@ -7,14 +7,14 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_conv2d_incoming_tensor_roles(Conv2DAttrs") { auto make_attrs = 
[](bool use_bias) { - return Conv2DAttrs{/*out_channels=*/4, - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, - /*groups=*/1, + return Conv2DAttrs{/*out_channels=*/4_n, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, + /*groups=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/use_bias}; }; @@ -48,14 +48,14 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Conv2D shape inference") { - int out_channels = 4; - int kernel_h = 3; - int kernel_w = 2; - int stride_h = 2; - int stride_w = 2; - int padding_h = 1; - int padding_w = 1; - int groups = 1; + nonnegative_int out_channels = 4_n; + nonnegative_int kernel_h = 3_n; + nonnegative_int kernel_w = 2_n; + nonnegative_int stride_h = 2_n; + nonnegative_int stride_w = 2_n; + nonnegative_int padding_h = 1_n; + nonnegative_int padding_w = 1_n; + nonnegative_int groups = 1_n; std::optional activation = std::nullopt; bool use_bias = true; @@ -72,13 +72,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*use_bias=*/true, }; - size_t num_samples = 7; - size_t input_channels = 4; - size_t input_height = 11; - size_t input_width = 15; + nonnegative_int num_samples = 7_n; + nonnegative_int input_channels = 4_n; + nonnegative_int input_height = 11_n; + nonnegative_int input_width = 15_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, input_channels, input_height, @@ -87,13 +87,13 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; - size_t output_height = 6; - size_t output_width = 8; + nonnegative_int output_height = 6_n; + nonnegative_int output_width = 8_n; TensorShape output = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, - size_t_from_int(out_channels), + out_channels, output_height, output_width, }}, @@ -101,18 +101,18 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape kernel = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(out_channels), + TensorDims{FFOrdered{ + out_channels, input_channels, - size_t_from_int(kernel_h), - size_t_from_int(kernel_w), + kernel_h, + kernel_w, }}, DataType::FLOAT, }; TensorShape bias = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(out_channels), + TensorDims{FFOrdered{ + out_channels, }}, DataType::FLOAT, }; @@ -137,147 +137,149 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_n, - int o_c, - int o_h, - int o_w) { + nonnegative_int o_n, + nonnegative_int o_c, + nonnegative_int o_h, + nonnegative_int o_w) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_n, - int o_c, - int o_h, - int o_w) { + nonnegative_int o_n, + nonnegative_int o_c, + nonnegative_int o_h, + nonnegative_int o_w) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto make_kernel = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_outchannels, - int o_inchannels, - int o_kernel_h, - int o_kernel_w) { + nonnegative_int o_outchannels, + nonnegative_int o_inchannels, + nonnegative_int o_kernel_h, + nonnegative_int o_kernel_w) { return lift_to_parallel_with_degrees( kernel, o_sum, o_eq, - FFOrdered{o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); + FFOrdered{ + o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); }; - auto 
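The expected output_height of 6 and output_width of 8 are not arbitrary: they follow from the usual convolution extent formula out = (in + 2*pad - kernel) / stride + 1 (with flooring integer division) applied to this test's parameters. Restated as runnable C++; the helper name is ours, not FlexFlow's:

#include <cassert>

unsigned conv_out_extent(unsigned in, unsigned pad, unsigned kernel, unsigned stride) {
  return (in + 2 * pad - kernel) / stride + 1; // integer division floors
}

int main() {
  assert(conv_out_extent(/*in=*/11, /*pad=*/1, /*kernel=*/3, /*stride=*/2) == 6);
  assert(conv_out_extent(/*in=*/15, /*pad=*/1, /*kernel=*/2, /*stride=*/2) == 8);
}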
make_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_outchannels) { - return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannels}); - }; + auto make_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_outchannels) { + return lift_to_parallel_with_degrees( + bias, o_sum, o_eq, FFOrdered{o_outchannels}); + }; SUBCASE("data parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1}, DiscardCopyDegree{degree}, 1); + make_bias(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n); CHECK(result == correct); } } SUBCASE("input channel parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1}, 1); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = 
get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1}, DiscardCopyDegree{1}, degree); + make_bias(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree); CHECK(result == correct); } } SUBCASE("propagating sum degree") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1}, 1); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc index 7580de24e5..e1a03a7613 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc @@ -15,10 +15,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; @@ -36,48 +36,54 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; TensorShape output = input; - auto make_input = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto make_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("partition parallelism (allowed)") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + ParallelTensorShape 
par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); tl::expected result = get_output_shape(attrs, par_input); - tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + tl::expected correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("sum parallelism (not allowed)") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -87,10 +93,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (not allowed)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorShape par_input = - make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc index d5aab55cb2..d6a92036f0 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc @@ -7,9 +7,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("EWAdd shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched dim size") { TensorShape incorrect_rhs = input_lhs; - dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1; + dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_n; tl::expected result = get_output_shape(attrs, input_lhs, incorrect_rhs); @@ -53,9 +53,9 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("EWAdd parallel shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -66,7 +66,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape unpar_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -83,68 +83,68 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return lift_to_parallel_with_degrees( - unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return lift_to_parallel_with_degrees( - unpar_rhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_rhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return 
lift_to_parallel_with_degrees( - unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("data parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("reduction parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_lhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_rhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("invalid discard copy parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); @@ -154,12 +154,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid mismatched parallelism degrees") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, degree); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, degree); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc index 94c382356e..bac6efba3f 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc @@ -7,16 +7,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ReLU shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementUnaryAttrs attrs = ElementUnaryAttrs{OperatorType::RELU, std::nullopt}; TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -31,20 +31,20 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); - auto make_i = [&](SumDegree o_sum, - DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return 
lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("partition i.e., sharding parallelism") { - int degree1 = 4; - int degree2 = 8; - ParallelTensorShape par_input = - make_i(SumDegree{1}, DiscardCopyDegree{1}, degree1, 1, degree2); + nonnegative_int degree1 = 4_n; + nonnegative_int degree2 = 8_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree1, 1_n, degree2); tl::expected result = get_output_shape(attrs, par_input); @@ -54,10 +54,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum degree > 1") { - int degree = 2; + nonnegative_int degree = 2_n; tl::expected result = get_output_shape( - attrs, make_i(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1)); + attrs, + make_input(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", @@ -65,10 +66,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy degree > 1") { - int degree = 2; + nonnegative_int degree = 2_n; tl::expected result = get_output_shape( - attrs, make_i(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1)); + attrs, + make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", diff --git a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc index 134737f6c0..8fe50a4217 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc @@ -8,8 +8,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Sum embedding shape inference") { - int out_channels = 128; - int num_entries = 1024; + nonnegative_int out_channels = 128_n; + nonnegative_int num_entries = 1024_n; EmbeddingAttrs attrs = EmbeddingAttrs{ /*num_entries=*/num_entries, /*out_channels=*/out_channels, @@ -17,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { /*data_type=*/DataType::FLOAT, }; - size_t batch_size = 48; - size_t features_dim = 56; + nonnegative_int batch_size = 48_n; + nonnegative_int features_dim = 56_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ batch_size, features_dim, }}, @@ -30,9 +30,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -40,9 +40,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(num_entries), - size_t_from_int(out_channels), + FFOrdered{ + num_entries, + out_channels, }, }, DataType::FLOAT, @@ -66,38 +66,44 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_features) { + nonnegative_int o_batch, + nonnegative_int o_features) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_features}); + input, o_sum, o_eq, FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_outchannels) { + nonnegative_int o_batch, + nonnegative_int o_outchannels) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_outchannels}); }; auto make_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_entries, - int o_outchannels) { + nonnegative_int o_entries, + nonnegative_int 
o_outchannels) { return lift_to_parallel_with_degrees( - weights, o_sum, o_eq, FFOrdered{o_entries, o_outchannels}); + weights, + o_sum, + o_eq, + FFOrdered{o_entries, o_outchannels}); }; SUBCASE("data parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); CHECK(result == correct); } @@ -105,21 +111,21 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, par_input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); CHECK(result == correct); } } SUBCASE("input features parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1); + make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n); CHECK(result == correct); } @@ -127,7 +133,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); CHECK(result == correct); } } @@ -139,15 +145,15 @@ TEST_SUITE(FF_TEST_SUITE) { // dimension. 
For now we choose to represent parallelism in the channel // dimension, but partitioning in the entry dimension is also potentially // useful as it produces sum parallelism in the output - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input = - make_input(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); CHECK(result == correct); } @@ -155,7 +161,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_weights(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/flat.cc b/lib/op-attrs/test/src/op-attrs/ops/flat.cc index 8998dfaffd..ebd869b3e5 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/flat.cc @@ -9,25 +9,25 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(FlatAttrs, TensorShape)") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4, - 2, - 3, + TensorDims{FFOrdered{ + 2_n, + 4_n, + 2_n, + 3_n, }}, DataType::FLOAT, }; SUBCASE("flatten all dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{0}}, - /*end_dim=*/ff_dim_t{nonnegative_int{4}}, + /*start_dim=*/ff_dim_t{0_n}, + /*end_dim=*/ff_dim_t{4_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2 * 4 * 2 * 3, + TensorDims{FFOrdered{ + 2_n * 4_n * 2_n * 3_n, }}, DataType::FLOAT, }; @@ -43,10 +43,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4, - 2 * 3, + TensorDims{FFOrdered{ + 2_n, + 4_n, + 2_n * 3_n, }}, DataType::FLOAT, }; @@ -62,10 +62,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2 * 4, - 2, - 3, + TensorDims{FFOrdered{ + 2_n * 4_n, + 2_n, + 3_n, }}, DataType::FLOAT, }; @@ -81,10 +81,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4 * 2, - 3, + TensorDims{FFOrdered{ + 2_n, + 4_n * 2_n, + 3_n, }}, DataType::FLOAT, }; @@ -124,18 +124,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("allows shard parallelism in non-flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 1, 3}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 1_n, 3_n}, }; tl::expected result = get_output_parallel_dim_degrees(attrs, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 3}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 3_n}, }; CHECK(result == correct); @@ -143,9 +143,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not allow shard parallelism in flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 2, 1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 2_n, 1_n}, }; 
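// Reasoning sketch (inferred from the subcase above, which maps shard
// degrees {2, 1, 1, 3} to {2, 1, 3}, and not stated explicitly in the
// patch): these attrs merge the two middle dims, and here the shard degree
// of 2 sits on a dim inside that flattened range. A flatten cannot split
// or relocate an existing sharding, so no consistent set of output degrees
// exists and get_output_parallel_dim_degrees is expected to return an
// error rather than a ParallelTensorDimDegrees value.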
std::optional result = @@ -157,18 +157,18 @@ SUBCASE("allows sum parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1, 1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n}, }; CHECK(result == correct); @@ -176,18 +176,18 @@ SUBCASE("allows discard copy parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{2}, - FFOrdered{1, 1, 1, 1}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{2}, - FFOrdered{1, 1, 1}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, + FFOrdered{1_n, 1_n, 1_n}, }; CHECK(result == correct); @@ -203,22 +203,22 @@ ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4, 2}, - ShardParallelDim{8, 1}, - ShardParallelDim{6, 1}, - ShardParallelDim{9, 3}, + ShardParallelDim{4_n, 2_n}, + ShardParallelDim{8_n, 1_n}, + ShardParallelDim{6_n, 1_n}, + ShardParallelDim{9_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{7}, - DiscardCopyDegree{5}, + SumDegree{7_n}, + DiscardCopyDegree{5_n}, }, }, DataType::FLOAT, }; FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{1}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3}}, + /*start_dim=*/ff_dim_t{1_n}, + /*end_dim=*/ff_dim_t{3_n}, }; tl::expected result = @@ -227,13 +227,13 @@ ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4, 2}, - ShardParallelDim{8 * 6, 1}, - ShardParallelDim{9, 3}, + ShardParallelDim{4_n, 2_n}, + ShardParallelDim{8_n * 6_n, 1_n}, + ShardParallelDim{9_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{7}, - DiscardCopyDegree{5}, + SumDegree{7_n}, + DiscardCopyDegree{5_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc index b9426a89a2..b9aa3c0677 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc @@ -58,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, - 18, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, + 18_n, }}, DataType::FLOAT, }; @@ -70,9 +70,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 12, - 16, + TensorDims{FFOrdered{ + 12_n, + 16_n, }}, DataType::FLOAT, }; @@ -125,49 +125,58 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o0, - int o1, - int o2, - int o3) { + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2, + nonnegative_int o3) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o0, - int o1, - int o2, - int o3) { +
nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2, + nonnegative_int o3) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; - auto make_gamma_weights = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o2) { - return lift_to_parallel_with_degrees( - gamma, o_sum, o_eq, FFOrdered{o0, o2}); - }; + auto make_gamma_weights = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + gamma, o_sum, o_eq, FFOrdered{o0, o2}); + }; - auto make_beta_weights = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o2) { - return lift_to_parallel_with_degrees( - beta, o_sum, o_eq, FFOrdered{o0, o2}); - }; + auto make_beta_weights = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + beta, o_sum, o_eq, FFOrdered{o0, o2}); + }; SUBCASE("parallel shape inference (LayerNorm)") { SUBCASE("partition parallelism (not in axes)") { - int degree0 = 2; - int degree2 = 3; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 3_n; ParallelTensorShape par_input = make_input( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2, 1); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { tl::expected result = get_output_shape(attrs_affine_true, par_input); - tl::expected correct = make_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2, 1); + tl::expected correct = + make_output(SumDegree{1_n}, + DiscardCopyDegree{1_n}, + degree0, + 1_n, + degree2, + 1_n); CHECK(result == correct); } @@ -179,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_gamma_weights( - SumDegree{1}, DiscardCopyDegree{1}, degree0, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); CHECK(result == correct); } @@ -199,7 +208,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_beta_weights( - SumDegree{1}, DiscardCopyDegree{1}, degree0, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); CHECK(result == correct); } @@ -215,11 +224,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (in axes)") { - int degree1 = 2; - int degree2 = 4; + nonnegative_int degree1 = 2_n; + nonnegative_int degree2 = 4_n; ParallelTensorShape par_input = make_input( - SumDegree{1}, DiscardCopyDegree{1}, 1, degree1, degree2, 1); + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, degree2, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -248,10 +257,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -280,10 +289,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorShape par_input = - 
make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc index 191515b062..eaa99ef099 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_linear_incoming_tensor_roles(LinearAttrs)") { auto make_attrs = [](bool use_bias) { return LinearAttrs{ - /*out_channels=*/16, + /*out_channels=*/16_n, /*use_bias=*/use_bias, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -47,7 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Linear shape inference") { - int out_channels = 16; + nonnegative_int out_channels = 16_n; LinearAttrs attrs = LinearAttrs{ /*out_channels=*/out_channels, /*use_bias=*/true, @@ -56,13 +56,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*regularizer=*/std::nullopt, }; - size_t batch_size = 12; - size_t extra_dim = 16; - size_t in_channels = 8; + nonnegative_int batch_size = 12_n; + nonnegative_int extra_dim = 16_n; + nonnegative_int in_channels = 8_n; TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, in_channels, @@ -73,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -84,9 +84,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape projection = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ in_channels, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(out_channels), + FFOrdered{ + out_channels, }, }, DataType::FLOAT, @@ -127,56 +127,66 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_extra_dim, - int o_channel) { + nonnegative_int o_batch, + nonnegative_int o_extra_dim, + nonnegative_int o_channel) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); + input, + o_sum, + o_eq, + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_extra_dim, - int o_channel) { + nonnegative_int o_batch, + nonnegative_int o_extra_dim, + nonnegative_int o_channel) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_projection = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_inchannel, - int o_outchannel) { + nonnegative_int o_inchannel, + nonnegative_int o_outchannel) { return lift_to_parallel_with_degrees( - projection, o_sum, o_eq, FFOrdered{o_inchannel, o_outchannel}); + projection, + o_sum, + o_eq, + FFOrdered{o_inchannel, o_outchannel}); }; - auto make_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_outchannel) { - return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannel}); - }; + auto make_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_outchannel) { + return lift_to_parallel_with_degrees( + bias, o_sum, o_eq, FFOrdered{o_outchannel}); + }; 
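// Worked example of what these helpers build (a sketch; it assumes
// lift_to_parallel_with_degrees pairs each dim's full size with the given
// shard degree and attaches the replica degrees, matching its use in the
// other op-attrs tests): in the "data parallelism" subcase below,
// make_input(SumDegree{2_n}, DiscardCopyDegree{1_n}, 4_n, 8_n, 1_n) lifts
// the {12, 16, 8} input into a ParallelTensorShape with shard dims
// {ShardParallelDim{12_n, 4_n}, ShardParallelDim{16_n, 8_n},
// ShardParallelDim{8_n, 1_n}} and replica dims {SumDegree{2_n},
// DiscardCopyDegree{1_n}}.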
SUBCASE("data parallelism") { - int input_sum_degree = 2; - int extra_dim_degree = 8; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int extra_dim_degree = 8_n; + nonnegative_int degree = 4_n; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, - DiscardCopyDegree{1}, + DiscardCopyDegree{1_n}, degree, extra_dim_degree, - 1); + 1_n); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree}, - DiscardCopyDegree{1}, + DiscardCopyDegree{1_n}, degree, extra_dim_degree, - 1); + 1_n); CHECK(result == correct); } @@ -185,10 +195,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_projection_shape(attrs, par_input); tl::expected correct = make_projection( - SumDegree{1}, + SumDegree{1_n}, DiscardCopyDegree{input_sum_degree * degree * extra_dim_degree}, - 1, - 1); + 1_n, + 1_n); CHECK(result == correct); } @@ -198,27 +208,30 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = make_bias(SumDegree{input_sum_degree}, DiscardCopyDegree{degree * extra_dim_degree}, - 1); + 1_n); CHECK(result == correct); } } SUBCASE("reduction parallelism") { - int input_sum_degree = 2; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int degree = 4_n; - ParallelTensorShape par_input = make_input( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, 1, 1, degree); + ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + degree); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree * degree}, - DiscardCopyDegree{1}, - 1, - 1, - 1); + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + 1_n); CHECK(result == correct); } @@ -226,8 +239,10 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection( - SumDegree{1}, DiscardCopyDegree{input_sum_degree}, degree, 1); + make_projection(SumDegree{1_n}, + DiscardCopyDegree{input_sum_degree}, + degree, + 1_n); CHECK(result == correct); } @@ -235,23 +250,30 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); tl::expected correct = make_bias( - SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1}, 1); + SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - int input_sum_degree = 2; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int degree = 4_n; - ParallelTensorShape par_input = make_input( - SumDegree{input_sum_degree}, DiscardCopyDegree{degree}, 1, 1, 1); + ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, + DiscardCopyDegree{degree}, + 1_n, + 1_n, + 1_n); { tl::expected result = get_output_shape(attrs, par_input); - tl::expected correct = make_output( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, 1, 1, degree); + tl::expected correct = + make_output(SumDegree{input_sum_degree}, + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + degree); CHECK(result == correct); } @@ -259,8 +281,10 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection( - SumDegree{1}, DiscardCopyDegree{input_sum_degree}, 1, degree); + make_projection(SumDegree{1_n}, + DiscardCopyDegree{input_sum_degree}, + 1_n, + degree); CHECK(result == correct); } @@ -268,7 +292,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); 
tl::expected correct = make_bias( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, degree); + SumDegree{input_sum_degree}, DiscardCopyDegree{1_n}, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc index 0c14c0fc2a..6c14a226a2 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc @@ -9,27 +9,27 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("make_adaptive_pool2d") { - size_t input_n = 10; - size_t input_c = 11; - size_t input_h = 15; - size_t input_w = 20; + nonnegative_int input_n = 10_n; + nonnegative_int input_c = 11_n; + nonnegative_int input_h = 15_n; + nonnegative_int input_w = 20_n; Activation activation = Activation::RELU; PoolOp op = PoolOp::AVG; - TensorDims input_dims = - TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}; + TensorDims input_dims = TensorDims{ + FFOrdered{input_n, input_c, input_h, input_w}}; SUBCASE("input_h divisible by output_h && input_w divisible by output_w") { - int output_h = 5; - int output_w = 2; + nonnegative_int output_h = 5_n; + nonnegative_int output_w = 2_n; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/10, - /*stride_h=*/3, - /*stride_w=*/10, - /*padding_h=*/0, - /*padding_w=*/0, + /*kernel_h=*/3_n, + /*kernel_w=*/10_n, + /*stride_h=*/3_n, + /*stride_w=*/10_n, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/op, /*activation=*/activation, }; @@ -50,11 +50,11 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(correct_attrs, input_shape); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ input_n, input_c, - size_t_from_int(output_h), - size_t_from_int(output_w), + output_h, + output_w, }}, DataType::FLOAT, }; @@ -64,8 +64,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h not divisible by output_h") { - int output_h = 6; - int output_w = 2; + nonnegative_int output_h = 6_n; + nonnegative_int output_w = 2_n; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -76,8 +76,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_w not divisible by output_w") { - int output_h = 5; - int output_w = 3; + nonnegative_int output_h = 5_n; + nonnegative_int output_w = 3_n; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -88,16 +88,16 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h == output_h and input_w == output_w") { - int output_h = input_h; - int output_w = input_w; + nonnegative_int output_h = input_h; + nonnegative_int output_w = input_w; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/1, - /*kernel_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/0, + /*kernel_h=*/1_n, + /*kernel_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/op, /*activation=*/activation, }; @@ -126,22 +126,22 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(Pool2DAttrs, TensorShape)") { Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, /*activation=*/std::nullopt, }; SUBCASE("fails on non-4d inputs") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, - 14, + TensorDims{FFOrdered{ + 10_n, + 12_n, + 14_n, }}, 
DataType::FLOAT, }; @@ -155,14 +155,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("4d input") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{11, 13, 12, 6}}, + TensorDims{FFOrdered{11_n, 13_n, 12_n, 6_n}}, DataType::FLOAT, }; tl::expected result = get_output_shape(attrs, input); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{11, 13, 6, 4}}, + TensorDims{FFOrdered{11_n, 13_n, 6_n, 4_n}}, DataType::FLOAT, }; @@ -175,12 +175,12 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_attrs = [](PoolOp pool_type, std::optional const &activation) { return Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/pool_type, /*activation=*/activation, }; @@ -190,13 +190,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 4, - 1, - 1, - 1, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 4_n, + 1_n, + 1_n, + 1_n, }, }; @@ -211,13 +211,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 4, - 2, - 5, - 6, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 4_n, + 2_n, + 5_n, + 6_n, }, }; @@ -232,13 +232,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{3}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{1_n}, + DiscardCopyDegree{3_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -256,13 +256,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -279,13 +279,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -302,13 +302,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/Activation::RELU); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -326,12 +326,12 @@ TEST_SUITE(FF_TEST_SUITE) { // just do a single test to make sure it works/exists Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, /*activation=*/std::nullopt, }; @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 7}, - ShardParallelDim{16, 8}, - ShardParallelDim{12, 3}, - ShardParallelDim{6, 2}, + 
ShardParallelDim{14_n, 7_n}, + ShardParallelDim{16_n, 8_n}, + ShardParallelDim{12_n, 3_n}, + ShardParallelDim{6_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{2}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -359,14 +359,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 7}, - ShardParallelDim{16, 8}, - ShardParallelDim{6, 3}, - ShardParallelDim{4, 2}, + ShardParallelDim{14_n, 7_n}, + ShardParallelDim{16_n, 8_n}, + ShardParallelDim{6_n, 3_n}, + ShardParallelDim{4_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{2}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -377,14 +377,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 1}, - ShardParallelDim{16, 1}, - ShardParallelDim{12, 1}, - ShardParallelDim{6, 1}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 1_n}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{6_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{2}, - DiscardCopyDegree{1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc index 0d1c8bdf98..dc12eb12a8 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc @@ -10,21 +10,21 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, }; SUBCASE("valid") { - int degree = 3; + nonnegative_int degree = 3_n; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; @@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - int degree = 4; + nonnegative_int degree = 4_n; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; diff --git a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc index ba213f54f4..36a265ce9f 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc @@ -6,8 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Repartition shape inference") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 4; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 4_n; RepartitionAttrs attrs = RepartitionAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -16,14 +16,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc index 60a1018479..770ae20d38 100644 --- 
a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Replicate shape inference") { ReplicateAttrs attrs = ReplicateAttrs{ - /*replicate_degree=*/4, + /*replicate_degree=*/4_n, }; ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, - ShardParallelDim{14, 2}, - ShardParallelDim{16, 2}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{14_n, 2_n}, + ShardParallelDim{16_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -28,7 +28,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape result = get_output_shape(attrs, input); ParallelTensorShape correct_output = input; - correct_output.dims.replica_dims.discard_copy_degree = DiscardCopyDegree{8}; + correct_output.dims.replica_dims.discard_copy_degree = + DiscardCopyDegree{8_n}; CHECK(result == correct_output); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc index 5808e5ef42..8c80e348c0 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc @@ -10,16 +10,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, TensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; SUBCASE("attrs.dim in bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; tl::expected result = get_output_shape(attrs, input); @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("attrs.dims out of bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{4}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{4_n}}; std::optional result = optional_from_expected(get_output_shape(attrs, input)); @@ -41,47 +41,53 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, ParallelTensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; TensorShape output = input; - auto make_input = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto make_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("partition parallelism in non-softmax-dim (valid)") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + ParallelTensorShape par_input = 
make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); SUBCASE("attrs.dim in bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("attrs.dims out of bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{4}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{4_n}}; std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -92,12 +98,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallism in softmax dim (invalid)") { - int degree1 = 2; + nonnegative_int degree1 = 2_n; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree1, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -107,12 +113,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism (invalid)") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -122,12 +128,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (invalid)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc index 73f5f0674d..1187bfcfbf 100644 --- a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc +++ b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc @@ -6,8 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("PCGOperatorAttrs to/from json") { PCGOperatorAttrs correct = PCGOperatorAttrs{RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{nonnegative_int{1}}, - /*repartition_degree=*/4, + /*repartition_dim=*/ff_dim_t{1_n}, + /*repartition_degree=*/4_n, }}; nlohmann::json j = correct; auto result = j.get(); diff --git a/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc b/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc index c09c1ec3df..e3f3f4534e 100644 --- a/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc +++ b/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc @@ -5,13 +5,13 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ff_dim_t_from_relative_ff_dim_t") { - int input_dim = 5; + nonnegative_int input_dim = 5_n; SUBCASE("relative index is zero") { relative_ff_dim_t relative_ff_dim = 
relative_ff_dim_t{0}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{0}}); + CHECK(ff_dim == ff_dim_t{0_n}); } SUBCASE("relative index is positive") { @@ -20,14 +20,14 @@ TEST_SUITE(FF_TEST_SUITE) { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{1}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{1}}); + CHECK(ff_dim == ff_dim_t{1_n}); } SUBCASE("relative index is out of range") { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{10}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{10}}); + CHECK(ff_dim == ff_dim_t{10_n}); } } @@ -37,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{-1}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{4}}); + CHECK(ff_dim == ff_dim_t{4_n}); } SUBCASE("relative index is out of range") { diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc index 60d87300c1..7e072d82d9 100644 --- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc @@ -7,7 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") { - TensorDims goal = TensorDims{FFOrdered{1, 1, 4, 3}}; + TensorDims goal = + TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; SUBCASE("dims match") { bool result = tensor_dims_is_broadcastable_to(goal, goal); @@ -17,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs num_dims promotion") { - TensorDims curr = TensorDims{FFOrdered{4, 3}}; + TensorDims curr = TensorDims{FFOrdered{4_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -26,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs dim expansion") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 1, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -35,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs both num_dims promotion and dim expansion") { - TensorDims curr = TensorDims{FFOrdered{1, 3}}; + TensorDims curr = TensorDims{FFOrdered{1_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -44,7 +46,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs invalid dim promotion") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 2, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 2_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -53,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_dims(goal) < num_dims(curr)") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 10, 4, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -63,12 +67,13 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_broadcast_target_dims(std::unordered_set)") { - TensorDims d1 = TensorDims{FFOrdered{1, 10, 4, 3}}; + TensorDims d1 = TensorDims{FFOrdered{1_n, 10_n, 4_n, 3_n}}; - TensorDims d2 = TensorDims{FFOrdered{10, 4, 1}}; + TensorDims d2 = TensorDims{FFOrdered{10_n, 4_n, 1_n}}; SUBCASE("has target in inputs") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 4, 
3}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -78,7 +83,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has no possible target") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 1, 4}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -88,10 +94,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has possible target, but not in inputs") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 1, 4, 3}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n, 3_n}}; TensorDims possible_target = - TensorDims{FFOrdered{1, 1, 10, 4, 3}}; + TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; REQUIRE(tensor_dims_is_broadcastable_to(d1, possible_target)); REQUIRE(tensor_dims_is_broadcastable_to(d2, possible_target)); diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index df93f69f2e..290df8574e 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -85,15 +85,15 @@ struct ComputationGraphBuilder { // Add a 2D convolutional layer tensor_guid_t conv2d( tensor_guid_t const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation = std::nullopt, - int groups = 1, + nonnegative_int groups = 1_n, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -107,8 +107,8 @@ struct ComputationGraphBuilder { // Add an embedding layer tensor_guid_t embedding( tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &kernel_initializer = std::nullopt, @@ -121,32 +121,32 @@ struct ComputationGraphBuilder { // Add a cache layer tensor_guid_t cache(tensor_guid_t const &input, - int num_batches, + nonnegative_int num_batches, std::function score_f = {}, std::optional const &name = std::nullopt); // Add a 2D pooling layer tensor_guid_t pool2d(tensor_guid_t const &input, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, PoolOp type = PoolOp::MAX, std::optional const &activation = std::nullopt, std::optional const &name = std::nullopt); tensor_guid_t adaptive_pool2d( tensor_guid_t const &input, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp type = PoolOp::MAX, std::optional const &activation = std::nullopt, std::optional const &name = std::nullopt); tensor_guid_t layer_norm(tensor_guid_t const &input, - std::vector const &axes, + std::vector const &axes, bool elementwise_affine, float eps, std::optional const &name = std::nullopt); @@ -157,15 +157,15 @@ struct ComputationGraphBuilder { float eps, std::optional const &momentum, std::optional const &name = std::nullopt); - tensor_guid_t - batch_matmul(tensor_guid_t const &A, - tensor_guid_t const &B, - int a_seq_length_dim = -1, - int b_seq_length_dim = -1, - 
std::optional const &name = std::nullopt); + tensor_guid_t batch_matmul( + tensor_guid_t const &A, + tensor_guid_t const &B, + std::optional const &a_seq_length_dim = std::nullopt, + std::optional const &b_seq_length_dim = std::nullopt, + std::optional const &name = std::nullopt); tensor_guid_t dense( tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = DataType::FLOAT, @@ -181,7 +181,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); // Add a concat layer tensor_guid_t concat(std::vector const &tensors, - int axis, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); // Add a mean layer tensor_guid_t mean(tensor_guid_t const &input, @@ -191,47 +191,48 @@ struct ComputationGraphBuilder { // Add a split layer std::vector split(tensor_guid_t const &input, - std::vector const &split, - int axis, + std::vector const &split, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); // Add a flat layer - tensor_guid_t flat(tensor_guid_t const &input, - int start_dim = 0, - std::optional const &end_dim = std::nullopt, - std::optional const &name = std::nullopt); + tensor_guid_t + flat(tensor_guid_t const &input, + relative_ff_dim_t start_dim = relative_ff_dim_t{0}, + std::optional const &end_dim = std::nullopt, + std::optional const &name = std::nullopt); // Add a softmax layer tensor_guid_t softmax(tensor_guid_t const &input, - std::optional dim = std::nullopt, + std::optional dim = std::nullopt, std::optional const &name = std::nullopt); // Create input tensors and constants tensor_guid_t transpose(tensor_guid_t const &input, - std::vector const &perm, + std::vector const &perm, std::optional const &name = std::nullopt); tensor_guid_t reduce_sum(tensor_guid_t const &input, - std::vector const &axes, + std::vector const &axes, bool keepdims = false, std::optional const &name = std::nullopt); tensor_guid_t reshape(tensor_guid_t const &input, - std::vector const &shape, + std::vector const &shape, std::optional const &name = std::nullopt); tensor_guid_t reverse(tensor_guid_t const &input, - int axis, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); std::vector top_k(tensor_guid_t const &input, - int k, + nonnegative_int k, bool sorted, std::optional const &name = std::nullopt); tensor_guid_t multihead_attention( tensor_guid_t const &query, tensor_guid_t const &key, tensor_guid_t const &value, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, + nonnegative_int embed_dim, + nonnegative_int num_heads, + nonnegative_int kdim = 0_n, + nonnegative_int vdim = 0_n, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, @@ -254,7 +255,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); std::vector get_outputs(LayerAttrs const &) const; - tensor_guid_t get_output(LayerAttrs const &, int idx) const; + tensor_guid_t get_output(LayerAttrs const &, nonnegative_int idx) const; std::vector add_layer(LayerAttrs const &layer, diff --git a/lib/pcg/include/pcg/cpu_id_t.struct.toml b/lib/pcg/include/pcg/cpu_id_t.struct.toml index 0492a937be..152debbded 100644 --- a/lib/pcg/include/pcg/cpu_id_t.struct.toml +++ b/lib/pcg/include/pcg/cpu_id_t.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "cpu_index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/device_id.h 
b/lib/pcg/include/pcg/device_id.h index 28cf30eaba..36ea9de6b3 100644 --- a/lib/pcg/include/pcg/device_id.h +++ b/lib/pcg/include/pcg/device_id.h @@ -13,9 +13,9 @@ device_id_t operator+(device_id_t, size_t); DeviceType get_device_type(device_id_t const &device_id); gpu_id_t unwrap_gpu(device_id_t); cpu_id_t unwrap_cpu(device_id_t); -int get_raw_id(device_id_t); +nonnegative_int get_raw_id(device_id_t); -device_id_t device_id_from_index(int, DeviceType); +device_id_t device_id_from_index(nonnegative_int, DeviceType); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h index 05c486f0f7..9554995fa0 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h @@ -8,7 +8,7 @@ namespace FlexFlow { V1DataflowGraph to_v1(DataflowGraphView const &); V1DataflowGraph to_v1(DataflowGraphView const &, - std::unordered_map const &); + std::unordered_map const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml index c332b6b41d..57b559a18e 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml @@ -13,6 +13,7 @@ includes = [ "", "", "pcg/file_format/v1/graphs/v1_graph_edge.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -24,7 +25,7 @@ src_includes = [ [[fields]] name = "nodes" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "edges" diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml index 752706fe1d..9150c20056 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml @@ -9,18 +9,22 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "srcNode" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "srcIdx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dstNode" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dstIdx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h index fc9dfcef9a..426bad5a82 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h @@ -13,18 +13,19 @@ namespace FlexFlow { template -std::pair, bidict> +std::pair, + bidict> to_v1_including_node_numbering( LabelledDataflowGraphView const &g) { - bidict nodes = bidict_from_enumerating(get_nodes(g)); + bidict nodes = bidict_from_enumerating(get_nodes(g)); V1DataflowGraph unlabelled = to_v1(g, nodes.reversed()); - std::unordered_map node_labels = map_values( + std::unordered_map node_labels = map_values( nodes.as_unordered_map(), [&](Node const &n) { return g.at(n); }); - std::unordered_map> output_labels = + std::unordered_map> output_labels = map_values(nodes.as_unordered_map(), [&](Node const &n) { return transform(get_outputs(g, n), [&](DataflowOutput const &o) { return g.at(o); 
}); diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml index b440d0f03d..1f69f5cd93 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml @@ -18,6 +18,7 @@ includes = [ "", "pcg/file_format/v1/graphs/v1_dataflow_graph.dtg.h", "pcg/file_format/v1/graphs/v1_graph_output.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -29,11 +30,11 @@ src_includes = [ [[fields]] name = "node_labels" -type = "std::unordered_map" +type = "std::unordered_map<::FlexFlow::nonnegative_int, NodeLabel>" [[fields]] name = "output_labels" -type = "std::unordered_map>" +type = "std::unordered_map<::FlexFlow::nonnegative_int, std::vector>" [[fields]] name = "graph" diff --git a/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml b/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml index 0fe0b1761f..bd60564465 100644 --- a/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml +++ b/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml @@ -9,6 +9,7 @@ features = [ includes = [ "pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_series_split.dtg.h", "pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_parallel_split.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[values]] @@ -20,5 +21,5 @@ type = "::FlexFlow::V1BinaryParallelSplit" key = "parallel" [[values]] -type = "int" +type = "::FlexFlow::nonnegative_int" key = "leaf" diff --git a/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h b/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h index 5590d6999b..c0e9966425 100644 --- a/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { V1ComputationGraph to_v1(ComputationGraph const &); -std::pair> +std::pair> to_v1_including_node_numbering(ComputationGraph const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/gpu_id_t.struct.toml b/lib/pcg/include/pcg/gpu_id_t.struct.toml index 170dbb96fa..7a85b4c0a7 100644 --- a/lib/pcg/include/pcg/gpu_id_t.struct.toml +++ b/lib/pcg/include/pcg/gpu_id_t.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "gpu_index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/machine_space_coordinate.struct.toml b/lib/pcg/include/pcg/machine_space_coordinate.struct.toml index 9b197a74c9..2528eab849 100644 --- a/lib/pcg/include/pcg/machine_space_coordinate.struct.toml +++ b/lib/pcg/include/pcg/machine_space_coordinate.struct.toml @@ -11,15 +11,16 @@ features = [ includes = [ "pcg/device_type.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "node_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "device_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "device_type" diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 39591e8a70..11c5a81bba 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ 
-8,12 +8,12 @@ namespace FlexFlow { -int get_num_gpus(MachineSpecification const &ms); -int get_num_cpus(MachineSpecification const &ms); -int get_num_devices(MachineSpecification const &ms, - DeviceType const &device_type); -int get_num_devices_per_node(MachineSpecification const &ms, - DeviceType const &device_type); +nonnegative_int get_num_gpus(MachineSpecification const &ms); +nonnegative_int get_num_cpus(MachineSpecification const &ms); +nonnegative_int get_num_devices(MachineSpecification const &ms, + DeviceType const &device_type); +nonnegative_int get_num_devices_per_node(MachineSpecification const &ms, + DeviceType const &device_type); bool is_valid_machine_space_coordinate(MachineSpecification const &ms, MachineSpaceCoordinate const &coord); diff --git a/lib/pcg/include/pcg/machine_specification.struct.toml b/lib/pcg/include/pcg/machine_specification.struct.toml index e75b5018cb..7c624c7240 100644 --- a/lib/pcg/include/pcg/machine_specification.struct.toml +++ b/lib/pcg/include/pcg/machine_specification.struct.toml @@ -9,17 +9,21 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_nodes" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_cpus_per_node" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_gpus_per_node" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "inter_node_bandwidth" diff --git a/lib/pcg/include/pcg/machine_view.h b/lib/pcg/include/pcg/machine_view.h index f72b2359dc..6ed9e7dd9c 100644 --- a/lib/pcg/include/pcg/machine_view.h +++ b/lib/pcg/include/pcg/machine_view.h @@ -5,7 +5,7 @@ #include "machine_view.dtg.h" #include "pcg/device_id_t.dtg.h" #include "pcg/operator_task_space.dtg.h" -#include "task_space_coordinate.dtg.h" +#include "pcg/task_space_coordinate.dtg.h" #include #include #include diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h index 1a19397c72..b095fad088 100644 --- a/lib/pcg/include/pcg/operator_task_space.h +++ b/lib/pcg/include/pcg/operator_task_space.h @@ -16,8 +16,8 @@ std::unordered_set TaskSpaceCoordinate get_task_space_maximum_coordinate(OperatorTaskSpace const &task); -size_t num_dims(OperatorTaskSpace const &task); -size_t num_tasks(OperatorTaskSpace const &task); +nonnegative_int num_dims(OperatorTaskSpace const &task); +nonnegative_int num_tasks(OperatorTaskSpace const &task); OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer); diff --git a/lib/pcg/include/pcg/operator_task_space.struct.toml b/lib/pcg/include/pcg/operator_task_space.struct.toml index 3ab8b83173..9cc4f6b93a 100644 --- a/lib/pcg/include/pcg/operator_task_space.struct.toml +++ b/lib/pcg/include/pcg/operator_task_space.struct.toml @@ -11,6 +11,7 @@ features = [ includes = [ "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -20,4 +21,4 @@ src_includes = [ [[fields]] name = "degrees" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index 019b120936..faa9b73d95 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -33,15 +33,15 @@ struct ParallelComputationGraphBuilder { 
parallel_tensor_guid_t conv2d( parallel_tensor_guid_t const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation = std::nullopt, - int groups = 1, + nonnegative_int groups = 1_n, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -50,7 +50,7 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t dense( parallel_tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = DataType::FLOAT, @@ -61,8 +61,8 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t embedding( parallel_tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &kernel_initializer = std::nullopt, @@ -72,10 +72,10 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t const &query, parallel_tensor_guid_t const &key, parallel_tensor_guid_t const &value, - int embed_dim, - int num_heads, - std::optional kdim = std::nullopt, - std::optional vdim = std::nullopt, + nonnegative_int embed_dim, + nonnegative_int num_heads, + std::optional kdim = std::nullopt, + std::optional vdim = std::nullopt, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, @@ -120,20 +120,20 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t parallel_partition(parallel_tensor_guid_t const &x, ff_dim_t dim, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_combine(parallel_tensor_guid_t const &x, ff_dim_t dim, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_replicate(parallel_tensor_guid_t const &x, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_reduce(parallel_tensor_guid_t const &x, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); private: diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h index 7aac8558e4..5bce560020 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h @@ -11,7 +11,7 @@ parallel_tensor_guid_t get_parallel_tensor(ParallelComputationGraphEdge const &); parallel_layer_guid_t get_src_layer(ParallelComputationGraphEdge const &); parallel_layer_guid_t get_dst_layer(ParallelComputationGraphEdge const &); -int get_dst_layer_input_idx(ParallelComputationGraphEdge const &); +nonnegative_int get_dst_layer_input_idx(ParallelComputationGraphEdge const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/start_invariant_machine_view.h b/lib/pcg/include/pcg/start_invariant_machine_view.h index f5091c69d1..cdf17213f9 100644 --- a/lib/pcg/include/pcg/start_invariant_machine_view.h +++ b/lib/pcg/include/pcg/start_invariant_machine_view.h @@ -17,7 +17,7 @@ MachineView StartInvariantMachineView 
     start_invariant_from_machine_view(MachineView const &mv);
 
-size_t num_dims(StartInvariantMachineView const &mv);
+nonnegative_int num_dims(StartInvariantMachineView const &mv);
 
 DeviceType get_device_type(StartInvariantMachineView const &mv);
 
diff --git a/lib/pcg/include/pcg/stride_t.struct.toml b/lib/pcg/include/pcg/stride_t.struct.toml
index a764497b8b..8d950c5f39 100644
--- a/lib/pcg/include/pcg/stride_t.struct.toml
+++ b/lib/pcg/include/pcg/stride_t.struct.toml
@@ -9,6 +9,10 @@ features = [
   "fmt",
 ]
 
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h",
+]
+
 [[fields]]
 name = "unwrapped"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
diff --git a/lib/pcg/include/pcg/task_space_coordinate.struct.toml b/lib/pcg/include/pcg/task_space_coordinate.struct.toml
index 65aea167cb..1057676b8e 100644
--- a/lib/pcg/include/pcg/task_space_coordinate.struct.toml
+++ b/lib/pcg/include/pcg/task_space_coordinate.struct.toml
@@ -11,6 +11,7 @@ features = [
 
 includes = [
   "<vector>",
+  "utils/nonnegative_int/nonnegative_int.h",
 ]
 
 src_includes = [
@@ -20,4 +21,4 @@ src_includes = [
 
 [[fields]]
 name = "raw_coord"
-type = "std::vector<int>"
+type = "std::vector<::FlexFlow::nonnegative_int>"
diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc
index 09772fa9d9..0d07c43a91 100644
--- a/lib/pcg/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/computation_graph_builder.cc
@@ -376,30 +376,32 @@ tensor_guid_t
 tensor_guid_t ComputationGraphBuilder::conv2d(
     tensor_guid_t const &x,
-    int outChannels,
-    int kernelH,
-    int kernelW,
-    int strideH,
-    int strideW,
-    int paddingH,
-    int paddingW,
+    nonnegative_int outChannels,
+    nonnegative_int kernelH,
+    nonnegative_int kernelW,
+    nonnegative_int strideH,
+    nonnegative_int strideW,
+    nonnegative_int paddingH,
+    nonnegative_int paddingW,
     std::optional<Activation> const &activation,
-    int groups,
+    nonnegative_int groups,
     bool use_bias,
     std::optional<InitializerAttrs> const &kernel_initializer,
     std::optional<InitializerAttrs> const &bias_initializer,
    std::optional<RegularizerAttrs> const &kernel_regularizer,
    std::optional<std::string> const &maybe_name) {
-  Conv2DAttrs attrs = Conv2DAttrs{outChannels,
-                                  kernelH,
-                                  kernelW,
-                                  strideH,
-                                  strideW,
-                                  paddingH,
-                                  paddingW,
-                                  groups,
-                                  activation,
-                                  use_bias};
+  Conv2DAttrs attrs = Conv2DAttrs{
+      /*out_channels=*/outChannels,
+      /*kernel_h=*/kernelH,
+      /*kernel_w=*/kernelW,
+      /*stride_h=*/strideH,
+      /*stride_w=*/strideW,
+      /*padding_h=*/paddingH,
+      /*padding_w=*/paddingW,
+      /*groups=*/groups,
+      /*activation=*/activation,
+      /*use_bias=*/use_bias,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -451,13 +453,18 @@ tensor_guid_t ComputationGraphBuilder::dropout(
 
 tensor_guid_t ComputationGraphBuilder::embedding(
     tensor_guid_t const &x,
-    int num_entries,
-    int outDim,
+    nonnegative_int num_entries,
+    nonnegative_int outDim,
     AggregateOp aggr,
     DataType dtype,
     std::optional<InitializerAttrs> const &kernel_initializer,
    std::optional<std::string> const &maybe_name) {
-  EmbeddingAttrs attrs = EmbeddingAttrs{num_entries, outDim, aggr, dtype};
+  EmbeddingAttrs attrs = EmbeddingAttrs{
+      /*num_entries=*/num_entries,
+      /*out_channels=*/outDim,
+      /*aggr=*/aggr,
+      /*data_type=*/dtype,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -509,12 +516,12 @@ tensor_guid_t ComputationGraphBuilder::gather(
 }
 
 tensor_guid_t ComputationGraphBuilder::pool2d(
     tensor_guid_t const &x,
-    int kernelH,
-    int kernelW,
-    int strideH,
-    int strideW,
-    int paddingH,
-    int paddingW,
+    nonnegative_int kernelH,
+    nonnegative_int kernelW,
+    nonnegative_int strideH,
+    nonnegative_int strideW,
+    nonnegative_int paddingH,
+    nonnegative_int paddingW,
     PoolOp type,
     std::optional<Activation> const &activation,
     std::optional<std::string> const &maybe_name) {
@@ -547,8 +554,8 @@ tensor_guid_t ComputationGraphBuilder::pool2d(
 
 tensor_guid_t ComputationGraphBuilder::adaptive_pool2d(
     tensor_guid_t const &uncasted_input,
-    int output_h,
-    int output_w,
+    nonnegative_int output_h,
+    nonnegative_int output_w,
     PoolOp type,
     std::optional<Activation> const &activation,
     std::optional<std::string> const &maybe_name) {
@@ -637,10 +644,10 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention(
     tensor_guid_t const &query,
     tensor_guid_t const &key,
     tensor_guid_t const &value,
-    int embed_dim,
-    int num_heads,
-    int kdim,
-    int vdim,
+    nonnegative_int embed_dim,
+    nonnegative_int num_heads,
+    nonnegative_int kdim,
+    nonnegative_int vdim,
     float dropout,
     bool bias,
     bool add_bias_kv,
@@ -662,14 +669,16 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention(
         "If you need this functionality, please create an issue.");
   }
 
-  MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{embed_dim,
-                                                          num_heads,
-                                                          kdim,
-                                                          vdim,
-                                                          dropout,
-                                                          bias,
-                                                          add_bias_kv,
-                                                          add_zero_attn};
+  MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{
+      /*embed_dim=*/embed_dim,
+      /*num_heads=*/num_heads,
+      /*kdim=*/kdim,
+      /*vdim=*/vdim,
+      /*dropout=*/dropout,
+      /*bias=*/bias,
+      /*add_bias_kv=*/add_bias_kv,
+      /*add_zero_attn=*/add_zero_attn,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -743,7 +752,7 @@ TensorDims ComputationGraphBuilder::get_broadcast_target_dims(
 
 tensor_guid_t ComputationGraphBuilder::dense(
     tensor_guid_t const &input,
-    int outDim,
+    nonnegative_int outDim,
     std::optional<Activation> activation,
     bool use_bias,
     DataType data_type,
@@ -752,8 +761,13 @@ tensor_guid_t ComputationGraphBuilder::dense(
     std::optional<std::string> const &maybe_name,
     std::optional<std::string> const &projection_name,
     std::optional<std::string> const &bias_name) {
-  LinearAttrs attrs =
-      LinearAttrs{outDim, use_bias, data_type, activation, std::nullopt};
+  LinearAttrs attrs = LinearAttrs{
+      /*out_channels=*/outDim,
+      /*use_bias=*/use_bias,
+      /*data_type=*/data_type,
+      /*activation=*/activation,
+      /*regularizer=*/std::nullopt,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -794,12 +808,11 @@ tensor_guid_t ComputationGraphBuilder::dense(
 
 tensor_guid_t ComputationGraphBuilder::concat(
     std::vector<tensor_guid_t> const &inputs,
-    int axis,
+    relative_ff_dim_t axis,
     std::optional<std::string> const &maybe_name) {
-  relative_ff_dim_t wrapped_axis = relative_ff_dim_t{axis};
 
   ConcatAttrs attrs = ConcatAttrs{ff_dim_t_from_relative_ff_dim_t(
-      wrapped_axis, num_dims(this->get_shape(inputs[0])))};
+      axis, num_dims(this->get_shape(inputs[0])))};
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -817,17 +830,17 @@ tensor_guid_t ComputationGraphBuilder::concat(
 
 tensor_guid_t ComputationGraphBuilder::flat(
     tensor_guid_t const &input,
-    int start_dim,
-    std::optional<int> const &end_dim,
+    relative_ff_dim_t start_dim,
+    std::optional<relative_ff_dim_t> const &end_dim,
     std::optional<std::string> const &maybe_name) {
-  int input_num_dims = num_dims(this->get_shape(input));
+  nonnegative_int input_num_dims = num_dims(this->get_shape(input));
 
   FlatAttrs attrs = FlatAttrs{
-      /*start_dim=*/ff_dim_t_from_relative_ff_dim_t(
-          relative_ff_dim_t{start_dim}, input_num_dims),
+      /*start_dim=*/ff_dim_t_from_relative_ff_dim_t(start_dim, input_num_dims),
       /*end_dim=*/
-      ff_dim_t_from_relative_ff_dim_t(
-          relative_ff_dim_t{end_dim.value_or(input_num_dims)}, input_num_dims),
+      ff_dim_t_from_relative_ff_dim_t(end_dim.value_or(relative_ff_dim_t{
+                                          input_num_dims.unwrap_nonnegative()}),
+                                      input_num_dims),
   };
 
   std::string name =
@@ -843,16 +856,15 @@ tensor_guid_t ComputationGraphBuilder::flat(
 
 tensor_guid_t ComputationGraphBuilder::layer_norm(
     tensor_guid_t const &input,
-    std::vector<int> const &relative_axes,
+    std::vector<relative_ff_dim_t> const &relative_axes,
     bool elementwise_affine,
     float eps,
     std::optional<std::string> const &maybe_name) {
 
   TensorShape input_shape = this->get_shape(input);
 
-  auto resolve_dim_idx = [&](int dim_idx) {
-    return ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t{dim_idx},
-                                           num_dims(input_shape));
+  auto resolve_dim_idx = [&](relative_ff_dim_t dim_idx) {
+    return ff_dim_t_from_relative_ff_dim_t(dim_idx, num_dims(input_shape));
   };
 
   stack_vector<ff_dim_t, MAX_TENSOR_DIM> axes = stack_vector_of<MAX_TENSOR_DIM>(
@@ -910,15 +922,16 @@ tensor_guid_t ComputationGraphBuilder::layer_norm(
 
 tensor_guid_t ComputationGraphBuilder::softmax(
     tensor_guid_t const &input,
-    std::optional<int> maybe_dim,
+    std::optional<relative_ff_dim_t> maybe_dim,
     std::optional<std::string> const &maybe_name) {
 
   TensorShape input_shape = this->get_shape(input);
 
-  int dim = maybe_dim.value_or(num_dims(input_shape) - 1);
+  relative_ff_dim_t dim = maybe_dim.value_or(
+      relative_ff_dim_t{num_dims(input_shape).unwrap_nonnegative() - 1});
 
-  SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t(
-      relative_ff_dim_t{dim}, num_dims(input_shape))};
+  SoftmaxAttrs attrs =
+      SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t(dim, num_dims(input_shape))};
 
   if (attrs.dim.value >= num_dims(input_shape)) {
     throw mk_runtime_error(
diff --git a/lib/pcg/src/pcg/device_id.cc b/lib/pcg/src/pcg/device_id.cc
index a8cfe1f82f..1a4f7b7d22 100644
--- a/lib/pcg/src/pcg/device_id.cc
+++ b/lib/pcg/src/pcg/device_id.cc
@@ -25,7 +25,7 @@ cpu_id_t unwrap_cpu(device_id_t device_id) {
   return device_id.get<cpu_id_t>();
 }
 
-int get_raw_id(device_id_t device_id) {
+nonnegative_int get_raw_id(device_id_t device_id) {
   switch (get_device_type(device_id)) {
     case DeviceType::GPU:
       return unwrap_gpu(device_id).gpu_index;
@@ -36,7 +36,7 @@ int get_raw_id(device_id_t device_id) {
   }
 }
 
-device_id_t device_id_from_index(int idx, DeviceType device_type) {
+device_id_t device_id_from_index(nonnegative_int idx, DeviceType device_type) {
   switch (device_type) {
     case DeviceType::GPU:
       return device_id_t{gpu_id_t{idx}};
diff --git a/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc b/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc
index cf150a339f..064e2d81d3 100644
--- a/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc
+++ b/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc
@@ -10,15 +10,15 @@ namespace FlexFlow {
 
 V1DataflowGraph to_v1(DataflowGraphView const &g) {
-  bidict<size_t, Node> node_enumeration_bidict =
+  bidict<nonnegative_int, Node> node_enumeration_bidict =
       bidict_from_enumerating(get_nodes(g));
-  std::unordered_map<Node, size_t> node_enumeration =
+  std::unordered_map<Node, nonnegative_int> node_enumeration =
       node_enumeration_bidict.reversed().as_unordered_map();
   return to_v1(g, node_enumeration);
 }
 
 V1DataflowGraph to_v1(DataflowGraphView const &g,
-                      std::unordered_map<Node, size_t> const &nodes) {
+                      std::unordered_map<Node, nonnegative_int> const &nodes) {
   std::unordered_set<V1GraphEdge> edges;
   for (DataflowEdge const &e : get_edges(g)) {
     edges.insert(V1GraphEdge{
diff --git a/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc b/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc
index d353ccdda3..ac819db342 100644
--- a/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc
+++ b/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc
@@ -1 +1,17 @@
 #include "pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using NodeLabel = value_type<0>;
+using OutputLabel = value_type<1>;
+
+template std::pair<V1LabelledDataflowGraph<NodeLabel, OutputLabel>,
+                   bidict<nonnegative_int, Node>>
+    to_v1_including_node_numbering(
+        LabelledDataflowGraphView<NodeLabel, OutputLabel> const &);
+
+template V1LabelledDataflowGraph<NodeLabel, OutputLabel>
+    to_v1(LabelledDataflowGraphView<NodeLabel, OutputLabel> const &);
+
+} // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
index 5341e03c0a..d39652a7e2 100644
--- a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
+++ b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
@@ -21,7 +21,7 @@ V1BinarySPDecomposition
     };
   } else if (type == "leaf") {
     return V1BinarySPDecomposition{
-        j.at("value").get<int>(),
+        j.at("value").get<nonnegative_int>(),
     };
   } else {
     throw mk_runtime_error(fmt::format(
@@ -45,7 +45,7 @@ void adl_serializer<V1BinarySPDecomposition>::to_json(
         j["type"] = "parallel";
         return std::monostate{};
       },
-      [&](int leaf) {
+      [&](nonnegative_int leaf) {
        j["value"] = leaf;
        j["type"] = "leaf";
        return std::monostate{};
diff --git a/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc b/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc
index 975e92dfb7..3511ccc269 100644
--- a/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc
+++ b/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc
@@ -9,13 +9,14 @@ V1ComputationGraph to_v1(ComputationGraph const &g) {
   };
 }
 
-std::pair<V1ComputationGraph, bidict<size_t, layer_guid_t>>
+std::pair<V1ComputationGraph, bidict<nonnegative_int, layer_guid_t>>
     to_v1_including_node_numbering(ComputationGraph const &cg) {
-  std::pair<V1LabelledDataflowGraph<LayerAttrs, TensorAttrs>, bidict<size_t, Node>>
+  std::pair<V1LabelledDataflowGraph<LayerAttrs, TensorAttrs>,
+            bidict<nonnegative_int, Node>>
       raw = to_v1_including_node_numbering(cg.raw_graph);
   V1ComputationGraph v1_cg = V1ComputationGraph{raw.first};
-  bidict<size_t, layer_guid_t> v1_node_ids =
+  bidict<nonnegative_int, layer_guid_t> v1_node_ids =
       map_values(raw.second, [](Node const &n) { return layer_guid_t{n}; });
 
   return {v1_cg, v1_node_ids};
diff --git a/lib/pcg/src/pcg/machine_space_offset.cc b/lib/pcg/src/pcg/machine_space_offset.cc
index 9990023f8c..4aa79b3d1b 100644
--- a/lib/pcg/src/pcg/machine_space_offset.cc
+++ b/lib/pcg/src/pcg/machine_space_offset.cc
@@ -17,8 +17,10 @@ MachineSpaceOffset get_machine_space_offset_from_coordinate(
         fmt::format("{} has different DeviceType from {}", start, coord));
   }
 
-  return MachineSpaceOffset{coord.node_idx - start.node_idx,
-                            coord.device_idx - start.device_idx,
+  return MachineSpaceOffset{coord.node_idx.unwrap_nonnegative() -
+                                start.node_idx.unwrap_nonnegative(),
+                            coord.device_idx.unwrap_nonnegative() -
+                                start.device_idx.unwrap_nonnegative(),
                             coord.device_type};
 }
 
diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc
index 19ff50b4b7..0fefeddd27 100644
--- a/lib/pcg/src/pcg/machine_specification.cc
+++ b/lib/pcg/src/pcg/machine_specification.cc
@@ -4,14 +4,16 @@
 #include "utils/exception.h"
 
 namespace FlexFlow {
-int get_num_gpus(MachineSpecification const &ms) {
+nonnegative_int get_num_gpus(MachineSpecification const &ms) {
   return ms.num_nodes * ms.num_gpus_per_node;
 }
-int get_num_cpus(MachineSpecification const &ms) {
+
+nonnegative_int get_num_cpus(MachineSpecification const &ms) {
   return ms.num_nodes * ms.num_cpus_per_node;
 }
-int get_num_devices(MachineSpecification const &ms,
-                    DeviceType const &device_type) {
+
+nonnegative_int get_num_devices(MachineSpecification const &ms,
+                                DeviceType const &device_type) {
   switch (device_type) {
     case DeviceType::GPU:
       return get_num_gpus(ms);
@@
-22,8 +24,8 @@ int get_num_devices(MachineSpecification const &ms,
   }
 }
 
-int get_num_devices_per_node(MachineSpecification const &ms,
-                             DeviceType const &device_type) {
+nonnegative_int get_num_devices_per_node(MachineSpecification const &ms,
+                                         DeviceType const &device_type) {
   switch (device_type) {
     case DeviceType::GPU:
       return ms.num_gpus_per_node;
@@ -33,6 +35,7 @@ int get_num_devices_per_node(MachineSpecification const &ms,
     throw mk_runtime_error(fmt::format("Unknown DeviceType {}", device_type));
   }
 }
+
 bool is_valid_machine_space_coordinate(MachineSpecification const &ms,
                                        MachineSpaceCoordinate const &coord) {
   return (coord.node_idx < ms.num_nodes) &&
@@ -45,7 +48,7 @@ device_id_t get_device_id(MachineSpecification const &ms,
     throw mk_runtime_error(fmt::format(
        "Invalid coordinate {} for machine specification {}", ms, coord));
   }
-  int raw_idx =
+  nonnegative_int raw_idx =
       coord.node_idx * get_num_devices_per_node(ms, coord.device_type) +
       coord.device_idx;
   return device_id_from_index(raw_idx, coord.device_type);
diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc
index cc42ad83b2..fe319dc63c 100644
--- a/lib/pcg/src/pcg/machine_view.cc
+++ b/lib/pcg/src/pcg/machine_view.cc
@@ -16,6 +16,9 @@
 #include "utils/containers/transform.h"
 #include "utils/containers/zip.h"
 #include "utils/exception.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/nonnegative_int/num_elements.h"
+
 namespace FlexFlow {
 
 size_t num_dims(MachineView const &mv) {
@@ -71,47 +74,57 @@ std::optional<MachineSpaceCoordinate> get_machine_space_coordinate(
   }
 
   auto get_dimension_indices_for_dimension =
-      [&](MachineSpecificationDimension dimension) {
-        std::vector<MachineSpecificationDimension> mv_dimensions =
-            get_dimensions(machine_view);
-        return filter(count(mv_dimensions.size()), [&](size_t idx) {
-          return mv_dimensions.at(idx) == dimension;
-        });
-      };
-
-  auto compute_index = [&](int start_idx,
-                           std::vector<size_t> const &dimension_indices) {
-    std::vector<stride_t> mv_strides = get_strides(machine_view);
-
-    std::vector<int> sizes = transform(dimension_indices, [&](size_t i) {
-      return task.degrees.at(i) * mv_strides.at(i).unwrapped;
-    });
-    std::vector<int> coord_points = transform(
-        dimension_indices, [&](size_t i) { return coord.raw_coord.at(i); });
-    std::vector<int> strides = transform(dimension_indices, [&](size_t i) {
-      return mv_strides.at(i).unwrapped;
-    });
-
-    std::vector<int> coeffs = scanl(sizes, 1, std::multiplies<int>());
-
-    int index = start_idx;
-    for (auto [coeff, coord_point, stride] :
-         zip(coeffs, coord_points, strides)) {
-      index += coeff * coord_point * stride;
-    }
-    return index;
+      [&](MachineSpecificationDimension dimension)
+      -> std::vector<nonnegative_int> {
+    std::vector<MachineSpecificationDimension> mv_dimensions =
+        get_dimensions(machine_view);
+    return filter(nonnegative_range(num_elements(mv_dimensions)),
+                  [&](nonnegative_int idx) {
+                    return mv_dimensions.at(idx.unwrap_nonnegative()) ==
+                           dimension;
+                  });
   };
 
-  std::vector<size_t> inter_dimension_indices =
+  auto compute_index =
+      [&](nonnegative_int start_idx,
+          std::vector<nonnegative_int> const &dimension_indices) {
+        std::vector<stride_t> mv_strides = get_strides(machine_view);
+
+        std::vector<nonnegative_int> sizes =
+            transform(dimension_indices, [&](nonnegative_int i) {
+              return task.degrees.at(i.unwrap_nonnegative()) *
+                     mv_strides.at(i.unwrap_nonnegative()).unwrapped;
+            });
+        std::vector<nonnegative_int> coord_points =
+            transform(dimension_indices, [&](nonnegative_int i) {
+              return coord.raw_coord.at(i.unwrap_nonnegative());
+            });
+        std::vector<nonnegative_int> strides =
+            transform(dimension_indices, [&](nonnegative_int i) {
+              return mv_strides.at(i.unwrap_nonnegative()).unwrapped;
+            });
+
+        std::vector<nonnegative_int> coeffs = scanl(
+            sizes, nonnegative_int{1}, std::multiplies<nonnegative_int>());
+
+        nonnegative_int index = start_idx;
+        for (auto [coeff, coord_point, stride] :
+             zip(coeffs, coord_points, strides)) {
+          index += coeff * coord_point * stride;
+        }
+        return index;
+      };
+
+  std::vector<nonnegative_int> inter_dimension_indices =
       get_dimension_indices_for_dimension(
           MachineSpecificationDimension::INTER_NODE);
-  std::vector<size_t> intra_dimension_indices =
+  std::vector<nonnegative_int> intra_dimension_indices =
       get_dimension_indices_for_dimension(
           MachineSpecificationDimension::INTRA_NODE);
 
-  int node_idx =
+  nonnegative_int node_idx =
       compute_index(machine_view.start.node_idx, inter_dimension_indices);
-  int device_idx =
+  nonnegative_int device_idx =
       compute_index(machine_view.start.device_idx, intra_dimension_indices);
   MachineSpaceCoordinate ms_coord = MachineSpaceCoordinate{
       node_idx, device_idx, get_device_type(machine_view)};
diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc
index 7157b75082..57af6eedc7 100644
--- a/lib/pcg/src/pcg/operator_task_space.cc
+++ b/lib/pcg/src/pcg/operator_task_space.cc
@@ -14,18 +14,23 @@
 #include "utils/containers/unordered_set_of.h"
 #include "utils/containers/vector_of.h"
 #include "utils/fmt/unordered_set.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/nonnegative_int/num_elements.h"
+
 namespace FlexFlow {
 
 std::unordered_set<TaskSpaceCoordinate>
     get_task_space_coordinates(OperatorTaskSpace const &task) {
-  std::vector<std::vector<int>> coordinate_ranges = transform(
-      task.degrees, [&](int const &num_points) { return range(num_points); });
+  std::vector<std::vector<nonnegative_int>> coordinate_ranges =
+      transform(task.degrees, [&](nonnegative_int num_points) {
+        return nonnegative_range(num_points);
+      });
 
-  std::unordered_set<std::vector<int>> raw_coordinates =
+  std::unordered_set<std::vector<nonnegative_int>> raw_coordinates =
       unordered_set_of(cartesian_product(coordinate_ranges));
   std::unordered_set<TaskSpaceCoordinate> task_space_coordinates =
-      transform(raw_coordinates, [](std::vector<int> const &point) {
+      transform(raw_coordinates, [](std::vector<nonnegative_int> const &point) {
        return TaskSpaceCoordinate{point};
       });
   return task_space_coordinates;
@@ -36,10 +41,11 @@ TaskSpaceCoordinate
   return maximum(get_task_space_coordinates(task));
 }
 
-size_t num_dims(OperatorTaskSpace const &task) {
-  return task.degrees.size();
+nonnegative_int num_dims(OperatorTaskSpace const &task) {
+  return num_elements(task.degrees);
 }
-size_t num_tasks(OperatorTaskSpace const &task) {
+
+nonnegative_int num_tasks(OperatorTaskSpace const &task) {
   return product(task.degrees);
 }
 
@@ -48,7 +54,7 @@ OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
   parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0);
   ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor);
 
-  std::vector<int> degrees;
+  std::vector<nonnegative_int> degrees;
   extend(degrees, vector_of(ff_ordered_shard_degrees(shape)));
   degrees.push_back(get_sum_degree(shape));
   degrees.push_back(get_discard_copy_degree(shape));
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
index dadad6277f..2cf149f78a 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
@@ -9,7 +9,7 @@ std::unordered_set<ParallelOpAttrs>
     ParallelTensorShape const &goal) {
   std::unordered_set<ParallelOpAttrs> result;
 
-  int sum_degree = get_sum_degree(goal);
+  nonnegative_int sum_degree = get_sum_degree(goal);
   if (sum_degree != 1) {
     throw mk_runtime_error(
fmt::format("generate_weight_transform currently only supports " @@ -17,7 +17,7 @@ std::unordered_set sum_degree)); } - int discard_copy_degree = get_discard_copy_degree(goal); + nonnegative_int discard_copy_degree = get_discard_copy_degree(goal); if (discard_copy_degree != 1) { result.insert(ParallelOpAttrs{ReplicateAttrs{discard_copy_degree}}); } diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index e2f4555328..16896347e0 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -108,8 +108,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_matmul( std::optional const &maybe_name) { BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/-1, - /*b_seq_length_dim=*/-1, + /*a_seq_length_dim=*/std::nullopt, + /*b_seq_length_dim=*/std::nullopt, }; std::string name = @@ -141,30 +141,32 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::cast( parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t const &raw_input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation, - int groups, + nonnegative_int groups, bool use_bias, std::optional const &kernel_initializer, std::optional const &bias_initializer, std::optional const &kernel_regularizer, std::optional const &maybe_name) { - Conv2DAttrs attrs = Conv2DAttrs{outChannels, - kernelH, - kernelW, - strideH, - strideW, - paddingH, - paddingW, - groups, - activation, - use_bias}; + Conv2DAttrs attrs = Conv2DAttrs{ + /*out_channels=*/outChannels, + /*kernel_h=*/kernelH, + /*kernel_w=*/kernelW, + /*stride_h=*/strideH, + /*stride_w=*/strideW, + /*padding_h=*/paddingH, + /*padding_w=*/paddingW, + /*groups=*/groups, + /*activation=*/activation, + /*use_bias=*/use_bias, + }; std::string name = maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs})); @@ -192,7 +194,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation, bool use_bias, DataType data_type, @@ -200,11 +202,11 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( std::optional const &bias_initializer, std::optional const &maybe_name) { LinearAttrs attrs = LinearAttrs{ - outDim, - use_bias, - data_type, - activation, - std::nullopt, + /*out_channels=*/outDim, + /*use_bias=*/use_bias, + /*data_type=*/data_type, + /*activation=*/activation, + /*regularizer=*/std::nullopt, }; std::string name = @@ -239,18 +241,18 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t ParallelComputationGraphBuilder::embedding( parallel_tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype, std::optional const &kernel_initializer, std::optional const &maybe_name) { EmbeddingAttrs attrs = EmbeddingAttrs{ - num_entries, - outDim, - aggr, - dtype, + /*num_entries=*/num_entries, + 
+      /*out_channels=*/outDim,
+      /*aggr=*/aggr,
+      /*data_type=*/dtype,
   };
 
   std::string name =
@@ -274,10 +276,10 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention(
     parallel_tensor_guid_t const &query,
     parallel_tensor_guid_t const &key,
     parallel_tensor_guid_t const &value,
-    int embed_dim,
-    int num_heads,
-    std::optional<int> maybe_kdim,
-    std::optional<int> maybe_vdim,
+    nonnegative_int embed_dim,
+    nonnegative_int num_heads,
+    std::optional<nonnegative_int> maybe_kdim,
+    std::optional<nonnegative_int> maybe_vdim,
     float dropout,
     bool bias,
     bool add_bias_kv,
@@ -287,8 +289,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention(
     std::optional<InitializerAttrs> output_bias_initializer,
     std::optional<std::string> const &maybe_name) {
 
-  int kdim = maybe_kdim.value_or(embed_dim);
-  int vdim = maybe_vdim.value_or(embed_dim);
+  nonnegative_int kdim = maybe_kdim.value_or(embed_dim);
+  nonnegative_int vdim = maybe_vdim.value_or(embed_dim);
 
   MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{
       /*embed_dim=*/embed_dim,
@@ -491,10 +493,13 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::elu(
 
 parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition(
     parallel_tensor_guid_t const &input,
     ff_dim_t dim,
-    int degree,
+    nonnegative_int degree,
     std::optional<std::string> const &maybe_name) {
 
-  RepartitionAttrs attrs = RepartitionAttrs{dim, degree};
+  RepartitionAttrs attrs = RepartitionAttrs{
+      /*repartition_dim=*/dim,
+      /*repartition_degree=*/degree,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs}));
@@ -510,10 +515,13 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition(
 
 parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine(
     parallel_tensor_guid_t const &input,
     ff_dim_t dim,
-    int degree,
+    nonnegative_int degree,
     std::optional<std::string> const &maybe_name) {
 
-  CombineAttrs attrs = CombineAttrs{dim, degree};
+  CombineAttrs attrs = CombineAttrs{
+      /*combine_dim=*/dim,
+      /*combine_degree=*/degree,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs}));
@@ -528,7 +536,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine(
 
 parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate(
     parallel_tensor_guid_t const &input,
-    int degree,
+    nonnegative_int degree,
     std::optional<std::string> const &maybe_name) {
 
   ReplicateAttrs attrs = ReplicateAttrs{degree};
@@ -546,7 +554,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate(
 
 parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_reduce(
     parallel_tensor_guid_t const &input,
-    int degree,
+    nonnegative_int degree,
     std::optional<std::string> const &maybe_name) {
 
   ReductionAttrs attrs = ReductionAttrs{degree};
@@ -662,7 +670,7 @@ std::vector<parallel_tensor_guid_t> ParallelComputationGraphBuilder::add_layer(
 
   std::vector<DataflowOutput> raw_weight_tensors;
   for (auto const &kv : enumerate_vector(weights)) {
-    int weight_idx = kv.first;
+    nonnegative_int weight_idx = kv.first;
     ParallelTensorAttrs weight_tensor_attrs = kv.second;
 
     std::optional<std::string> weight_name =
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
index d30739486e..f37d08dc8a 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
@@ -16,7 +16,7 @@ parallel_layer_guid_t get_dst_layer(ParallelComputationGraphEdge const &e) {
   return parallel_layer_guid_t{e.raw_edge.dst.node};
 }
-int get_dst_layer_input_idx(ParallelComputationGraphEdge const &e) {
+nonnegative_int get_dst_layer_input_idx(ParallelComputationGraphEdge const &e) {
   return e.raw_edge.dst.idx;
 }
 
diff --git a/lib/pcg/src/pcg/start_invariant_machine_view.cc b/lib/pcg/src/pcg/start_invariant_machine_view.cc
index 1fcc3ea12f..e9f864d416 100644
--- a/lib/pcg/src/pcg/start_invariant_machine_view.cc
+++ b/lib/pcg/src/pcg/start_invariant_machine_view.cc
@@ -7,6 +7,7 @@
 #include "utils/containers/scanl.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/zip.h"
+#include "utils/nonnegative_int/num_elements.h"
 
 namespace FlexFlow {
 
 MachineView machine_view_from_start_invariant(
@@ -20,8 +21,8 @@ StartInvariantMachineView
   return StartInvariantMachineView{mv.dimensions, get_device_type(mv)};
 }
 
-size_t num_dims(StartInvariantMachineView const &start_inv_mv) {
-  return start_inv_mv.dimensions.size();
+nonnegative_int num_dims(StartInvariantMachineView const &start_inv_mv) {
+  return num_elements(start_inv_mv.dimensions);
 }
 
 DeviceType get_device_type(StartInvariantMachineView const &start_inv_mv) {
@@ -59,7 +60,7 @@ std::optional<MachineSpaceOffset> get_machine_space_offset(
     TaskSpaceCoordinate const &coord,
     MachineSpecification const &machine_specification) {
   MachineSpaceCoordinate dummy_start =
-      MachineSpaceCoordinate{0, 0, get_device_type(start_inv_machine_view)};
+      MachineSpaceCoordinate{0_n, 0_n, get_device_type(start_inv_machine_view)};
   MachineView mv =
       machine_view_from_start_invariant(start_inv_machine_view, dummy_start);
   std::optional<MachineSpaceCoordinate> ms_coord =
diff --git a/lib/pcg/test/src/pcg/computation_graph.cc b/lib/pcg/test/src/pcg/computation_graph.cc
index e2ed51b2f1..d92d65ad7b 100644
--- a/lib/pcg/test/src/pcg/computation_graph.cc
+++ b/lib/pcg/test/src/pcg/computation_graph.cc
@@ -13,9 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
@@ -40,9 +40,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
@@ -66,16 +66,16 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
 
     tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES);
     b.dense(input,
-            /*outDim=*/14,
+            /*outDim=*/14_n,
             /*activation=*/Activation::RELU,
             /*use_bias=*/true,
             /*data_type=*/DataType::FLOAT,
@@ -103,9 +103,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
@@ -131,9 +131,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
@@ -161,16 +161,16 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
 
     tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES);
     b.dense(input,
-            /*outDim=*/14,
+            /*outDim=*/14_n,
             /*activation=*/Activation::RELU,
             /*use_bias=*/true,
             /*data_type=*/DataType::FLOAT,
diff --git a/lib/pcg/test/src/pcg/computation_graph_builder.cc b/lib/pcg/test/src/pcg/computation_graph_builder.cc
index e7fa853be9..98a4e2a241 100644
--- a/lib/pcg/test/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/test/src/pcg/computation_graph_builder.cc
@@ -8,22 +8,22 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("ComputationGraphBuilder") {
     ComputationGraphBuilder b;
 
-    size_t batch_size = 2;
+    nonnegative_int batch_size = 2_n;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{batch_size, 3, 10, 10}},
+        TensorDims{FFOrdered<nonnegative_int>{batch_size, 3_n, 10_n, 10_n}},
         DataType::FLOAT,
     };
     tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES);
 
     tensor_guid_t output = b.conv2d(input,
-                                    /*outChannels=*/5,
-                                    /*kernelH=*/3,
-                                    /*kernelW=*/3,
-                                    /*strideH=*/1,
-                                    /*strideW=*/1,
-                                    /*paddingH=*/0,
-                                    /*paddingW=*/0);
+                                    /*outChannels=*/5_n,
+                                    /*kernelH=*/3_n,
+                                    /*kernelW=*/3_n,
+                                    /*strideH=*/1_n,
+                                    /*strideW=*/1_n,
+                                    /*paddingH=*/0_n,
+                                    /*paddingW=*/0_n);
     // ComputationGraph cg = b.computation_graph;
     // CHECK(get_layers(cg).size() == 1);
   }
diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
index 9068e14517..4102efd48e 100644
--- a/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
+++ b/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
@@ -9,11 +9,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         V1BinarySeriesSplit{
             V1BinarySPDecomposition{
                 V1BinaryParallelSplit{
-                    V1BinarySPDecomposition{2},
-                    V1BinarySPDecomposition{2},
+                    V1BinarySPDecomposition{2_n},
+                    V1BinarySPDecomposition{2_n},
                 },
             },
-            V1BinarySPDecomposition{3},
+            V1BinarySPDecomposition{3_n},
         },
     };
 
@@ -68,11 +68,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     V1BinarySeriesSplit example_split = V1BinarySeriesSplit{
         V1BinarySPDecomposition{
             V1BinaryParallelSplit{
-                V1BinarySPDecomposition{2},
-                V1BinarySPDecomposition{2},
+                V1BinarySPDecomposition{2_n},
+                V1BinarySPDecomposition{2_n},
             },
         },
-        V1BinarySPDecomposition{3},
+        V1BinarySPDecomposition{3_n},
     };
 
     nlohmann::json example_json = {
@@ -124,11 +124,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     V1BinaryParallelSplit example_split = V1BinaryParallelSplit{
         V1BinarySPDecomposition{
             V1BinaryParallelSplit{
-                V1BinarySPDecomposition{2},
-                V1BinarySPDecomposition{2},
+                V1BinarySPDecomposition{2_n},
+                V1BinarySPDecomposition{2_n},
             },
         },
-        V1BinarySPDecomposition{3},
+        V1BinarySPDecomposition{3_n},
     };
 
     nlohmann::json example_json = {
diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc
index 8336d81bb4..59c606adb1 100644
--- a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc
+++ b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc
@@ -10,15 +10,15 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            12,
-            16,
+        TensorDims{FFOrdered<nonnegative_int>{
+            12_n,
+            16_n,
         }},
         DataType::FLOAT,
     };
 
     tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES);
-    tensor_guid_t mm_output = b.dense(input, 8);
+    tensor_guid_t mm_output = b.dense(input, 8_n);
     tensor_guid_t relu_output = b.relu(mm_output);
 
     return b.computation_graph;
diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc
index 8ce25c4bc5..682cf2d798 100644
--- a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc
+++ b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc
@@ -12,19 +12,19 @@
TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t mm_output = b.dense(input, 8); + parallel_tensor_guid_t mm_output = b.dense(input, 8_n); parallel_tensor_guid_t relu_output = b.relu(mm_output); return b.pcg; diff --git a/lib/pcg/test/src/pcg/machine_specification.cc b/lib/pcg/test/src/pcg/machine_specification.cc index c183ae0d31..6d339350a0 100644 --- a/lib/pcg/test/src/pcg/machine_specification.cc +++ b/lib/pcg/test/src/pcg/machine_specification.cc @@ -7,11 +7,10 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("MachineSpecification") { - MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/4, - /*num_cpus_per_node=*/16, - /*num_gpus_per_node=*/8, + /*num_nodes=*/4_n, + /*num_cpus_per_node=*/16_n, + /*num_gpus_per_node=*/8_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; @@ -32,19 +31,19 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_device_id") { SUBCASE("valid MachineSpaceCoordinate") { MachineSpaceCoordinate coord = MachineSpaceCoordinate{ - /*node_idx=*/2, - /*device_idx=*/12, + /*node_idx=*/2_n, + /*device_idx=*/12_n, DeviceType::CPU, }; device_id_t correct = - device_id_from_index(2 * 16 + 12, DeviceType::CPU); + device_id_from_index(nonnegative_int{2 * 16 + 12}, DeviceType::CPU); device_id_t result = get_device_id(ms, coord); CHECK(correct == result); } SUBCASE("MachineSpaceCoordinate out of bounds for given machine spec") { MachineSpaceCoordinate coord = MachineSpaceCoordinate{ - /*node_idx=*/2, - /*device_idx=*/18, + /*node_idx=*/2_n, + /*device_idx=*/18_n, DeviceType::CPU, }; CHECK_THROWS(get_device_id(ms, coord)); diff --git a/lib/pcg/test/src/pcg/machine_view.cc b/lib/pcg/test/src/pcg/machine_view.cc index 3e9d48fac3..e286f08bf2 100644 --- a/lib/pcg/test/src/pcg/machine_view.cc +++ b/lib/pcg/test/src/pcg/machine_view.cc @@ -12,10 +12,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("MachineView - utility functions") { MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}}}; SUBCASE("num_dims") { @@ -43,48 +43,48 @@ TEST_SUITE(FF_TEST_SUITE) { * Where the (x,) are the `TaskSpaceCoordinate`s, and the underlying grid * is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/3, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/3_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (2,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("TaskSpaceCoordinate is out of bounds") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{4}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{4_n}}; std::optional result = get_machine_space_coordinate(task, mv, coord, ms); std::optional correct = std::nullopt; @@ -112,52 +112,52 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, + MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/2, /*device_idx=*/2, DeviceType::GPU}; + /*node_idx=*/2_n, /*device_idx=*/2_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/2, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/2_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -179,52 +179,52 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/0, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/0_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/1, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/1_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -253,45 +253,45 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/8, - /*num_gpus_per_node=*/8, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/8_n, + /*num_gpus_per_node=*/8_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/3, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/3_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/7, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/7_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -319,23 +319,23 @@ TEST_SUITE(FF_TEST_SUITE) { * select */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set correct = { - device_id_t{gpu_id_t{1}}, - device_id_t{gpu_id_t{3}}, - device_id_t{gpu_id_t{5}}, + device_id_t{gpu_id_t{1_n}}, + device_id_t{gpu_id_t{3_n}}, + device_id_t{gpu_id_t{5_n}}, }; std::unordered_set result = get_device_ids(task, mv, ms); CHECK(result == correct); @@ -364,26 +364,26 @@ TEST_SUITE(FF_TEST_SUITE) { */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, + 
MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set correct = { - device_id_t{gpu_id_t{7}}, - device_id_t{gpu_id_t{9}}, - device_id_t{gpu_id_t{12}}, - device_id_t{gpu_id_t{14}}, + device_id_t{gpu_id_t{7_n}}, + device_id_t{gpu_id_t{9_n}}, + device_id_t{gpu_id_t{12_n}}, + device_id_t{gpu_id_t{14_n}}, }; std::unordered_set result = get_device_ids(task, mv, ms); CHECK(result == correct); diff --git a/lib/pcg/test/src/pcg/operator_task_space.cc b/lib/pcg/test/src/pcg/operator_task_space.cc index 13198d9456..fa06af3635 100644 --- a/lib/pcg/test/src/pcg/operator_task_space.cc +++ b/lib/pcg/test/src/pcg/operator_task_space.cc @@ -18,13 +18,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; std::unordered_set correct = {{ - TaskSpaceCoordinate{{0, 0}}, - TaskSpaceCoordinate{{0, 1}}, - TaskSpaceCoordinate{{1, 0}}, - TaskSpaceCoordinate{{1, 1}}, + TaskSpaceCoordinate{{0_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 1_n}}, + TaskSpaceCoordinate{{1_n, 0_n}}, + TaskSpaceCoordinate{{1_n, 1_n}}, }}; std::unordered_set result = get_task_space_coordinates(task); @@ -32,13 +32,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{1, 2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{1_n, 2_n, 2_n}}; std::unordered_set correct = {{ - TaskSpaceCoordinate{{0, 0, 0}}, - TaskSpaceCoordinate{{0, 0, 1}}, - TaskSpaceCoordinate{{0, 1, 0}}, - TaskSpaceCoordinate{{0, 1, 1}}, + TaskSpaceCoordinate{{0_n, 0_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 0_n, 1_n}}, + TaskSpaceCoordinate{{0_n, 1_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 1_n, 1_n}}, }}; std::unordered_set result = get_task_space_coordinates(task); @@ -48,17 +48,17 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_task_space_maximum_coordinate") { SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n}}; - TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2, 1}}; + TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); CHECK(correct == result); } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3, 2, 4}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n, 4_n}}; - TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2, 1, 3}}; + TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n, 3_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); CHECK(correct == result); } diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index dd8308561f..979a96d204 100644 --- 
a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -45,12 +45,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape, CreateGrad::YES); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -110,12 +110,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape tensor_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -186,12 +186,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -246,7 +246,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); LinearAttrs op_attrs = LinearAttrs{ - /*out_channels=*/14, + /*out_channels=*/14_n, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -293,7 +293,7 @@ TEST_SUITE(FF_TEST_SUITE) { {}, {raw_projection_tensor_attrs}); - ReplicateAttrs replicate_attrs = ReplicateAttrs{/*degree=*/2}; + ReplicateAttrs replicate_attrs = ReplicateAttrs{/*degree=*/2_n}; ParallelLayerAttrs replicate_layer_attrs = ParallelLayerAttrs{ PCGOperatorAttrs{replicate_attrs}, std::nullopt, @@ -346,12 +346,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape tensor_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{10, 1}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{10_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{2}, - DiscardCopyDegree{2}, + SumDegree{2_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index 3f66b33b6e..ef3173d744 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -26,18 +26,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::add") { ParallelComputationGraphBuilder b; - ShardParallelDim d1 = ShardParallelDim{10, 2}; - ShardParallelDim d2 = ShardParallelDim{15, 3}; + ShardParallelDim d1 = ShardParallelDim{10_n, 2_n}; + ShardParallelDim d2 = ShardParallelDim{15_n, 3_n}; ParallelTensorShape lhs_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{15, 3}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{15_n, 3_n}, }, ReplicaParallelDimSet{ - 
SumDegree{2}, - DiscardCopyDegree{1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -76,18 +76,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::batch_matmul") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{4, 2}; + ShardParallelDim batch_dim = ShardParallelDim{4_n, 2_n}; ParallelTensorShape a_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ batch_dim, - ShardParallelDim{10, 1}, - ShardParallelDim{15, 3}, + ShardParallelDim{10_n, 1_n}, + ShardParallelDim{15_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -97,12 +97,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{ batch_dim, - ShardParallelDim{15, 3}, - ShardParallelDim{12, 1}, + ShardParallelDim{15_n, 3_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -130,7 +130,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("op attrs") { PCGOperatorAttrs result = get_parallel_layer_attrs(b.pcg, layer).op_attrs; - PCGOperatorAttrs correct = PCGOperatorAttrs{BatchMatmulAttrs{-1, -1}}; + PCGOperatorAttrs correct = + PCGOperatorAttrs{BatchMatmulAttrs{std::nullopt, std::nullopt}}; CHECK(result == correct); } } @@ -141,12 +142,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{1}, + SumDegree{3_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -179,28 +180,28 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::conv2d") { ParallelComputationGraphBuilder b; - size_t batch_size = 2; + nonnegative_int batch_size = 2_n; TensorShape unpar_input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 3, 10, 10}}, + TensorDims{FFOrdered{batch_size, 3_n, 10_n, 10_n}}, DataType::FLOAT, }; - ParallelTensorShape input_shape = - lift_to_parallel_with_degrees(unpar_input_shape, - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 1, 1}); + ParallelTensorShape input_shape = lift_to_parallel_with_degrees( + unpar_input_shape, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 1_n, 1_n}); parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - int outChannels = 6; - int kernelH = 5; - int kernelW = 4; - int strideH = 3; - int strideW = 2; - int paddingH = 1; - int paddingW = 0; + nonnegative_int outChannels = 6_n; + nonnegative_int kernelH = 5_n; + nonnegative_int kernelW = 4_n; + nonnegative_int strideH = 3_n; + nonnegative_int strideW = 2_n; + nonnegative_int paddingH = 1_n; + nonnegative_int paddingW = 0_n; parallel_tensor_guid_t output = b.conv2d(input, /*outChannels=*/outChannels, /*kernelH=*/kernelH, @@ -254,7 +255,7 @@ TEST_SUITE(FF_TEST_SUITE) { strideW, paddingH, paddingW, - /*groups=*/1, + /*groups=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/true, }; @@ -301,18 +302,18 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; - 
int outDim = 14; + nonnegative_int outDim = 14_n; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.dense(input, @@ -341,8 +342,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::embedding") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{12, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{12_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -350,8 +351,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::INT32, @@ -359,8 +360,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.embedding(input, - /*num_entries=*/32, - /*outDim=*/8, + /*num_entries=*/32_n, + /*outDim=*/8_n, AggregateOp::SUM, DataType::FLOAT); parallel_layer_guid_t layer = get_source_layer(output); @@ -384,9 +385,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::multihead_attention") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{12, 2}; - ShardParallelDim sequence_dim = ShardParallelDim{16, 1}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{12_n, 2_n}; + ShardParallelDim sequence_dim = ShardParallelDim{16_n, 1_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape query_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -395,8 +396,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -405,8 +406,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape key_shape = query_shape; ParallelTensorShape value_shape = query_shape; - int embed_dim = 8; - int num_heads = 6; + nonnegative_int embed_dim = 8_n; + nonnegative_int num_heads = 6_n; parallel_tensor_guid_t query = b.create_input_tensor(query_shape); parallel_tensor_guid_t key = b.create_input_tensor(key_shape); @@ -435,8 +436,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::relu") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 3}; - ShardParallelDim feature_dim = ShardParallelDim{32, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 3_n}; + ShardParallelDim feature_dim = ShardParallelDim{32_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -445,8 +446,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -474,8 +475,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_partition") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -484,8 +485,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - 
DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -493,7 +494,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = - b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2); + b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -514,8 +515,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_combine") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -524,8 +525,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -533,7 +534,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = - b.parallel_combine(input, ff_dim_t{nonnegative_int{0}}, 2); + b.parallel_combine(input, ff_dim_t{nonnegative_int{0}}, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -554,8 +555,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_replicate") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -564,15 +565,15 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_replicate(input, 2); + parallel_tensor_guid_t output = b.parallel_replicate(input, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -593,8 +594,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_reduce") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -603,15 +604,15 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{4}, - DiscardCopyDegree{1}, + SumDegree{4_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_reduce(input, 2); + parallel_tensor_guid_t output = b.parallel_reduce(input, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { diff --git a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc index 8383754aa2..71c4d1b1d0 100644 --- a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc +++ 
b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc @@ -8,15 +8,15 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("StartInvariantMachineView - utility functions") { StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}}, DeviceType::GPU}; SUBCASE("num_dims") { - int result = num_dims(simv); - int correct = 2; + nonnegative_int result = num_dims(simv); + nonnegative_int correct = 2_n; CHECK(result == correct); } @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_strides") { std::vector result = get_strides(simv); - std::vector correct = {stride_t{2}, stride_t{2}}; + std::vector correct = {stride_t{2_n}, stride_t{2_n}}; CHECK(result == correct); } @@ -43,11 +43,11 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("StartInvariantMachineView - conversions") { MachineSpaceCoordinate start = - MachineSpaceCoordinate{1, 2, DeviceType::GPU}; + MachineSpaceCoordinate{1_n, 2_n, DeviceType::GPU}; std::vector dimensions = { - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{3}, + MachineViewDimension{stride_t{3_n}, MachineSpecificationDimension::INTRA_NODE}}; MachineView mv = MachineView{start, dimensions}; @@ -94,21 +94,21 @@ TEST_SUITE(FF_TEST_SUITE) { * | (0,) | | (1,) | | (2,) | | * +-------+-------+-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, - /*inter_node_bandwidth=*/0, - /*intra_node_bandwidth=*/0}; + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, + /*inter_node_bandwidth=*/0.0, + /*intra_node_bandwidth=*/0.0}; SUBCASE("get_machine_space_offset") { SUBCASE("Task with TaskSpaceCoordinate = (0,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -117,7 +117,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 2, DeviceType::GPU}; MachineSpaceOffset result = @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (2,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 4, DeviceType::GPU}; MachineSpaceOffset result = @@ -162,23 +162,23 @@ TEST_SUITE(FF_TEST_SUITE) { * +-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{1}, + {MachineViewDimension{stride_t{1_n}, 
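// [editorial note] On the expected offsets in these subcases: for a
// start-invariant view, the machine-space offset of a task coordinate c is
// componentwise c[i] * stride[i], routed onto the inter-node or intra-node
// axis named by each MachineViewDimension. 1-D case above: stride 2 on the
// intra-node (GPU) axis maps coords (0,), (1,), (2,) to GPU offsets 0, 2, 4.
// 2-D case below: strides (1, 2) map coord (1,1) to node offset 1, GPU
// offset 2.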
MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/4, - /*num_gpus_per_node=*/4, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/4_n, + /*num_gpus_per_node=*/4_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("get_machine_space_offset") { SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -187,7 +187,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 2, DeviceType::GPU}; MachineSpaceOffset result = @@ -196,7 +196,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{1, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -205,7 +205,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{1, 2, DeviceType::GPU}; MachineSpaceOffset result = diff --git a/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h new file mode 100644 index 0000000000..92f7bb1c03 --- /dev/null +++ b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_H + +#include "substitutions/pcg_pattern_match.dtg.h" +#include "substitutions/sub_parallel_computation_graph.dtg.h" +#include "substitutions/substitution.dtg.h" + +namespace FlexFlow { + +/** + * @brief Applies \p substitution to \p sub_pcg at the location specified by \p + * match, returning the resulting SubParallelComputationGraph + * + * @param sub_pcg + * @param substitution + * @param match The location at which to apply substitution. This location in + * sub_pcg should match substitution's PCGPattern. Likely created by running + * FlexFlow::find_pattern_matches(PCGPattern const &, + * SubParallelComputationGraph const &). 
+ * @return SubParallelComputationGraph A sub-PCG similar to sub_pcg, but with + * the subgraph specified by match replaced with the result of the output + * expression of substitution + */ +SubParallelComputationGraph + apply_substitution(SubParallelComputationGraph const &sub_pcg, + Substitution const &substitution, + PCGPatternMatch const &match); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h b/lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h similarity index 76% rename from lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h rename to lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h index a0461b075b..74089c5aab 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h +++ b/lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h @@ -1,10 +1,10 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_EVALUATE_SUBSTITUTION_OUTPUT_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_EVALUATE_SUBSTITUTION_OUTPUT_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_EVALUATE_SUBSTITUTION_OUTPUT_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_EVALUATE_SUBSTITUTION_OUTPUT_H +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.dtg.h" #include "substitutions/pcg_pattern_match.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/substitution.dtg.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.dtg.h" #include namespace FlexFlow { diff --git a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h similarity index 62% rename from lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h rename to lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h index 603cb670bf..cd7e782909 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h +++ b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h @@ -1,11 +1,11 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.dtg.h" #include "substitutions/output_graph/output_graph_expr.dtg.h" #include "substitutions/output_graph/output_graph_expr_node_output.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.dtg.h" namespace FlexFlow { diff 
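// [editorial sketch] How the API documented above composes with
// find_pattern_matches. Per the doc comment, find_pattern_matches is assumed
// to return the discovered PCGPatternMatch locations; the wrapper function
// name below is hypothetical, not part of this patch.
SubParallelComputationGraph apply_at_first_match(
    SubParallelComputationGraph const &sub_pcg, Substitution const &sub) {
  std::vector<PCGPatternMatch> matches =
      find_pattern_matches(sub.pcg_pattern, sub_pcg);
  // Re-run the search after every application: node ids in the rewritten
  // graph need not line up with any remaining matches.
  return matches.empty() ? sub_pcg
                         : apply_substitution(sub_pcg, sub, matches.front());
}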
--git a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.struct.toml b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.struct.toml similarity index 100% rename from lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.struct.toml rename to lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.struct.toml diff --git a/lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h b/lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h similarity index 85% rename from lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h rename to lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h index b7ce13db0e..c3f9eff349 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h +++ b/lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_PERFORM_SHAPE_INFERENCE_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_PERFORM_SHAPE_INFERENCE_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_PERFORM_SHAPE_INFERENCE_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_PERFORM_SHAPE_INFERENCE_H #include "op-attrs/parallel_tensor_shape.dtg.h" #include "pcg/parallel_computation_graph/parallel_layer_attrs.dtg.h" diff --git a/lib/substitutions/include/substitutions/constraint_type.enum.toml b/lib/substitutions/include/substitutions/constraint_type.enum.toml index 8646ba1c83..f366a17725 100644 --- a/lib/substitutions/include/substitutions/constraint_type.enum.toml +++ b/lib/substitutions/include/substitutions/constraint_type.enum.toml @@ -9,3 +9,6 @@ features = [ [[values]] name = "EQUAL" + +[[values]] +name = "DIVISIBLE_BY" diff --git a/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h b/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h new file mode 100644 index 0000000000..2b31dada04 --- /dev/null +++ b/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_GET_ATTRIBUTE_MAP_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_GET_ATTRIBUTE_MAP_H + +#include "op-attrs/pcg_operator_attrs.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_value.dtg.h" + +namespace FlexFlow { + +std::unordered_map + get_attribute_map(PCGOperatorAttrs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h index 4affdd697f..c2c11fac51 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h @@ -9,6 +9,8 @@ OperatorAttributeConstraint op_type_equals_constraint(OperatorType); OperatorAttributeConstraint op_attr_key_equals(OperatorAttributeKey, OperatorAttributeValue const &); 
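// [editorial sketch] Using the new DIVISIBLE_BY constraint (declared just
// below, backed by the enum value added above): e.g. a pattern that only
// matches linear operators whose output channels divide evenly across a
// parallel degree. That OperatorAttributePattern aggregates a brace-initialized
// set of constraints is an assumption about its layout.
OperatorAttributePattern partitionable_linear(nonnegative_int degree) {
  return OperatorAttributePattern{{
      op_type_equals_constraint(OperatorType::LINEAR),
      op_attr_key_divisible_by(OperatorAttributeKey::OUT_CHANNELS, degree),
  }};
}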
+OperatorAttributeConstraint + op_attr_key_divisible_by(OperatorAttributeKey, nonnegative_int denominator); OperatorAttributeConstraint make_equals_constraint(OperatorAttributeExpr const &, OperatorAttributeValue const &); diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml index eb758ea4fc..af3666d46f 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml @@ -56,6 +56,7 @@ values = [ { name = "SHOULD_BROADCAST_RHS" }, { name = "DIM" }, { name = "AFFINE" }, + { name = "ELEMENTWISE_AFFINE" }, { name = "MOMENTUM" }, { name = "REGULARIZER" }, { name = "SHAPE" }, diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h new file mode 100644 index 0000000000..d46403a847 --- /dev/null +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_OPERATOR_ATTRIBUTE_KEY_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_OPERATOR_ATTRIBUTE_KEY_H + +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" + +namespace FlexFlow { + +std::vector all_operator_attribute_keys(); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml index bceff393d2..4ed226907e 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml @@ -10,7 +10,8 @@ features = [ ] includes = [ - "substitutions/operator_pattern/operator_attribute_key.dtg.h" + "substitutions/operator_pattern/operator_attribute_key.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -19,4 +20,4 @@ type = "::FlexFlow::OperatorAttributeKey" [[fields]] name = "index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml index 8fe4a9494d..3312b292a0 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml @@ -7,7 +7,6 @@ features = [ "fmt", "json", ] -explicit_constructors = false includes = [ "", @@ -21,6 +20,7 @@ includes = [ "op-attrs/tensor_shape.dtg.h", "op-attrs/datatype.dtg.h", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -31,7 +31,7 @@ src_includes = [ ] [[values]] -type = "int" +type = "::FlexFlow::nonnegative_int" [[values]] type = "bool" @@ -40,7 +40,10 @@ type = "bool" type = "float" [[values]] -type = "std::vector" +type = "std::optional" + +[[values]] +type = "std::vector<::FlexFlow::nonnegative_int>" [[values]] type = "std::vector<::FlexFlow::ff_dim_t>" @@ -55,10 +58,7 @@ type = "std::optional<::FlexFlow::Activation>" type = 
"::FlexFlow::ff_dim_t" [[values]] -type = "size_t" - -[[values]] -type = "::FlexFlow::AggregateOp" +type = "std::optional<::FlexFlow::AggregateOp>" [[values]] type = "std::optional<::FlexFlow::RegularizerAttrs>" diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h b/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h index e550767292..8c047fc44d 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h @@ -2,14 +2,19 @@ #define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_H #include "substitutions/output_graph/output_graph_expr.dtg.h" +#include "substitutions/output_graph/output_graph_expr_input.dtg.h" #include "substitutions/output_graph/output_graph_expr_node.dtg.h" #include "substitutions/output_graph/output_graph_expr_node_output.dtg.h" namespace FlexFlow { +std::unordered_set get_nodes(OutputGraphExpr const &); + std::vector get_node_outputs(OutputGraphExpr const &, OutputGraphExprNode const &); +std::unordered_set get_inputs(OutputGraphExpr const &); + } // namespace FlexFlow #endif diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h new file mode 100644 index 0000000000..e172edb025 --- /dev/null +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_VALUE_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_VALUE_H + +#include "substitutions/output_graph/output_graph_expr_value.dtg.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" + +namespace FlexFlow { + +OpenDataflowValue raw_open_dataflow_value_from_output_graph_expr_value( + OutputGraphExprValue const &); +OutputGraphExprValue output_graph_expr_value_from_raw_open_dataflow_value( + OpenDataflowValue const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml new file mode 100644 index 0000000000..641250e1f0 --- /dev/null +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "OutputGraphExprValue" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "substitutions/output_graph/output_graph_expr_input.dtg.h", + "substitutions/output_graph/output_graph_expr_node_output.dtg.h", +] + +[[values]] +type = "::FlexFlow::OutputGraphExprNodeOutput" + +[[values]] +type = "::FlexFlow::OutputGraphExprInput" diff --git a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h index 60540c0711..0921569d62 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h +++ b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h @@ -20,6 +20,9 @@ std::pair set_attr_to_constant(OperatorAttributeKey key, OperatorAttributeValue const &value); +std::pair + set_op_type_attr(OperatorType); + } // namespace FlexFlow #endif diff --git 
a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml index d712ea96f7..483f27791a 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml +++ b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml @@ -18,11 +18,12 @@ includes = [ src_includes = [ "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", + "utils/fmt/optional.h", ] -# [[fields]] -# name = "clone_operator" -# type = "std::optional" +[[fields]] +name = "template_operator" +type = "std::optional<::FlexFlow::PatternNode>" # NOTE(@wmdi): Not sure if it aligns with other design. Or alternatively we can # define the assignment for each operator type. diff --git a/lib/substitutions/include/substitutions/pcg_pattern.h b/lib/substitutions/include/substitutions/pcg_pattern.h index 7342e8169f..f0962b15c2 100644 --- a/lib/substitutions/include/substitutions/pcg_pattern.h +++ b/lib/substitutions/include/substitutions/pcg_pattern.h @@ -10,6 +10,8 @@ namespace FlexFlow { +std::unordered_set get_nodes(PCGPattern const &); + /** * @brief Find all locations in \p pcg that match \p pattern */ diff --git a/lib/substitutions/include/substitutions/pcg_pattern_match.h b/lib/substitutions/include/substitutions/pcg_pattern_match.h index 388377d70c..b946173422 100644 --- a/lib/substitutions/include/substitutions/pcg_pattern_match.h +++ b/lib/substitutions/include/substitutions/pcg_pattern_match.h @@ -6,7 +6,7 @@ #include "substitutions/pcg_pattern_match.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/unlabelled/pattern_node_output.dtg.h" -#include "substitutions/unlabelled/unlabelled_dataflow_graph_pattern_match.dtg.h" +#include "substitutions/unlabelled/unlabelled_dataflow_graph_pattern_match.h" namespace FlexFlow { @@ -17,7 +17,7 @@ bidict SubParallelComputationGraph const &spcg); UnlabelledDataflowGraphPatternMatch - get_unlabelled_pattern_match(PCGPatternMatch const &); + get_unlabelled_pattern_match(PCGPatternMatch const &match); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h b/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h index 15cbb6127c..c0544abe1b 100644 --- a/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h +++ b/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h @@ -12,7 +12,7 @@ namespace FlexFlow { SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_dst(parallel_tensor_guid_t const &tensor, parallel_layer_guid_t const &layer, - int input_idx); + nonnegative_int input_idx); SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_use(open_parallel_tensor_guid_t const &tensor, parallel_tensor_use_t const &use); diff --git a/lib/substitutions/include/substitutions/substitution.h b/lib/substitutions/include/substitutions/substitution.h index 7b4e5e6912..7dc4e714ab 100644 --- a/lib/substitutions/include/substitutions/substitution.h +++ b/lib/substitutions/include/substitutions/substitution.h @@ -1,12 +1,14 @@ #ifndef _FLEXFLOW_SUBSTITUTIONS_SUBSTITUTION_H #define _FLEXFLOW_SUBSTITUTIONS_SUBSTITUTION_H -#include "substitutions/pcg_pattern_match.dtg.h" -#include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/substitution.dtg.h" namespace FlexFlow { +bool 
is_isomorphic_to(Substitution const &, Substitution const &); + +std::string as_dot(Substitution const &); + /** * @brief Checks that all internal invariants of the given substitution hold * @@ -22,25 +24,6 @@ namespace FlexFlow { */ bool is_valid_substitution(Substitution const &); -/** - * @brief Applies \p substitution to \p sub_pcg at the location specified by \p - * match, returning the resulting SubParallelComputationGraph - * - * @param sub_pcg - * @param substitution - * @param match The location at which to apply substitution. This location in - * sub_pcg should match substitution's PCGPattern. Likely created by running - * FlexFlow::find_pattern_matches(PCGPattern const &, - * SubParallelComputationGraph const &). - * @return SubParallelComputationGraph A sub-PCG similar to sub_pcg, but with - * the subgraph specified by match replaced with the result of the output - * expression of substitution - */ -SubParallelComputationGraph - apply_substitution(SubParallelComputationGraph const &sub_pcg, - Substitution const &substitution, - PCGPatternMatch const &match); - } // namespace FlexFlow #endif diff --git a/lib/substitutions/include/substitutions/substitution_builder.h b/lib/substitutions/include/substitutions/substitution_builder.h new file mode 100644 index 0000000000..1548b2269b --- /dev/null +++ b/lib/substitutions/include/substitutions/substitution_builder.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_BUILDER_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_BUILDER_H + +#include "substitutions/output_graph/output_graph_expr_value.dtg.h" +#include "substitutions/substitution.dtg.h" +#include "substitutions/unlabelled/pattern_value.dtg.h" +#include + +namespace FlexFlow { + +struct SubstitutionBuilder { +public: + SubstitutionBuilder(); + + std::pair + add_input(TensorAttributePattern const &, + std::optional const &name = std::nullopt); + void equate_outputs(PatternValue const &, OutputGraphExprValue const &); + + std::vector add_pattern_node( + OperatorAttributePattern const &node_pattern, + std::vector const &inputs, + std::vector const &output_patterns, + std::optional const &name = std::nullopt); + + std::vector + add_output_graph_node(OutputOperatorAttrsAssignment const &node_expr, + std::vector const &inputs, + nonnegative_int num_outputs); + + PatternNode pattern_node_named(std::string const &) const; + PatternInput pattern_input_named(std::string const &) const; + + Substitution get_substitution() const; + +private: + LabelledOpenDataflowGraph + pattern_g; + LabelledOpenDataflowGraph + output_g; + bidict input_mapping; + bidict pattern_node_names; + bidict pattern_input_names; + bidict output_mapping; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml index a57dd25845..71e11a12d5 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml @@ -10,7 +10,8 @@ features = [ ] includes = [ - "substitutions/tensor_pattern/tensor_attribute_key.dtg.h" + "substitutions/tensor_pattern/tensor_attribute_key.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -19,4 +20,4 @@ type = "::FlexFlow::TensorAttributeKey" [[fields]] name = "index" -type = "int" 
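// [editorial sketch] Driving the SubstitutionBuilder declared above. The
// builder's member template arguments appear stripped in this excerpt;
// plausible instantiations are LabelledOpenDataflowGraph<OperatorAttributePattern,
// TensorAttributePattern> for pattern_g and string-keyed bidicts for the name
// maps. The value wrapping below (PatternValue{...}, OutputGraphExprValue{...})
// is likewise assumed. This toy substitution rewrites a matched relu to an
// attribute-copied relu:
Substitution relu_identity_rewrite() {
  SubstitutionBuilder b;
  auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all());

  std::vector<PatternValue> p_outs = b.add_pattern_node(
      OperatorAttributePattern{{op_type_equals_constraint(OperatorType::RELU)}},
      {PatternValue{p_input}},
      {tensor_attribute_pattern_match_all()},
      /*name=*/"relu");

  std::vector<OutputGraphExprValue> o_outs = b.add_output_graph_node(
      OutputOperatorAttrsAssignment{b.pattern_node_named("relu"), {}},
      {OutputGraphExprValue{o_input}},
      /*num_outputs=*/1_n);

  b.equate_outputs(p_outs.at(0), o_outs.at(0));
  return b.get_substitution();
}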
+type = "::FlexFlow::nonnegative_int" diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h index 5b7ebf4ef8..c1e28f8d8f 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h @@ -2,10 +2,13 @@ #define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_TENSOR_PATTERN_TENSOR_ATTRIBUTE_PATTERN_H #include "substitutions/tensor_pattern/tensor_attribute_pattern.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { TensorAttributePattern tensor_attribute_pattern_match_all(); +TensorAttributePattern + tensor_attr_pattern_require_num_dims(nonnegative_int num_dims); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml index 46b703a7fc..d2b931fb2d 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml @@ -12,10 +12,11 @@ includes = [ "", "utils/hash/vector.h", "utils/fmt/vector.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[values]] -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[values]] -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" diff --git a/lib/substitutions/include/substitutions/unity_substitution_set.h b/lib/substitutions/include/substitutions/unity_substitution_set.h new file mode 100644 index 0000000000..183f76ac8a --- /dev/null +++ b/lib/substitutions/include/substitutions/unity_substitution_set.h @@ -0,0 +1,47 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_UNITY_SUBSTITUTION_SET_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_UNITY_SUBSTITUTION_SET_H + +#include "pcg/machine_specification.dtg.h" +#include "substitutions/substitution.dtg.h" +#include "utils/fmt/vector.h" + +namespace FlexFlow { + +std::vector + get_substitution_set(MachineSpecification const &resources); + +Substitution create_combine_inception(nonnegative_int num_convs, + nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_combine_concat(nonnegative_int num_inputs, + nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_replicate_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + bool use_bias); +Substitution create_partition_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + Activation activation, + bool use_bias); +Substitution create_partition_conv2d_combine(nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_partition_attention_combine(nonnegative_int num_heads, + nonnegative_int degree); +Substitution create_replicate_attention_reduce(nonnegative_int num_heads, + nonnegative_int degree); +Substitution create_partition_add_combine(ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_relu_combine(ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_concat_combine(nonnegative_int num_inputs, + ff_dim_t concat_dim, + ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_softmax_combine(ff_dim_t softmax_dim, + ff_dim_t partition_dim, + nonnegative_int degree); +Substitution 
create_fuse_linear_activation(Activation activation); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h b/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h index 7a7c9c3c28..8c58cb991c 100644 --- a/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h +++ b/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h @@ -9,7 +9,7 @@ namespace FlexFlow { PatternInput get_src_input(InputPatternEdge const &); PatternNode get_dst_node(InputPatternEdge const &); -int get_dst_idx(InputPatternEdge const &); +nonnegative_int get_dst_idx(InputPatternEdge const &); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h b/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h index 14c0b9ddcc..ce30b18f55 100644 --- a/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h +++ b/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h @@ -9,13 +9,10 @@ namespace FlexFlow { -// OpenDataflowGraphView apply_match(UnlabelledGraphPattern const &pattern, -// UnlabelledDataflowGraphPatternMatch const -// &match); - OpenDataflowSubgraphResult subgraph_matched(OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match); + bool pattern_matches_subgraph_under( UnlabelledGraphPattern const &pattern, OpenDataflowGraphView const &subgraph, @@ -30,11 +27,6 @@ bool unlabelled_pattern_does_match( UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion); -std::vector - find_pattern_matches(UnlabelledGraphPattern const &pattern, - OpenDataflowGraphView const &graph, - MatchAdditionalCriterion const &additional_criterion); - } // namespace FlexFlow #endif diff --git a/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h b/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h index 3dd5b262c9..67f513b8b1 100644 --- a/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h +++ b/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h @@ -6,7 +6,7 @@ namespace FlexFlow { PatternNode get_src_node(PatternNodeOutput const &); -int get_idx(PatternNodeOutput const &); +nonnegative_int get_idx(PatternNodeOutput const &); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h b/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h index 7316098fb5..817e829709 100644 --- a/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h +++ b/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h @@ -8,8 +8,8 @@ namespace FlexFlow { PatternNode get_src_node(StandardPatternEdge const &); PatternNode get_dst_node(StandardPatternEdge const &); -int get_src_idx(StandardPatternEdge const &); -int get_dst_idx(StandardPatternEdge const &); +nonnegative_int get_src_idx(StandardPatternEdge const &); +nonnegative_int get_dst_idx(StandardPatternEdge const &); } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc new file mode 100644 index 0000000000..61bfe15d7b --- /dev/null +++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc @@ -0,0 +1,165 @@ +#include "substitutions/apply_substitution/apply_substitution.h" 
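// [editorial sketch] Enumerating the substitution catalog declared in
// unity_substitution_set.h above for a small machine; the field order follows
// the MachineSpecification uses earlier in this patch, and the bandwidth
// values are placeholders.
std::vector<Substitution> subs =
    get_substitution_set(MachineSpecification{/*num_nodes=*/2_n,
                                              /*num_cpus_per_node=*/8_n,
                                              /*num_gpus_per_node=*/4_n,
                                              /*inter_node_bandwidth=*/1.0,
                                              /*intra_node_bandwidth=*/1.0});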
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/sub_parallel_computation_graph_data.dtg.h" +#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +SubParallelComputationGraph + apply_substitution(SubParallelComputationGraph const &spcg, + Substitution const &sub, + PCGPatternMatch const &match) { + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + SubParallelComputationGraphEdge new_edge = + 
+
+    std::unordered_set<SubParallelComputationGraphEdge>
+        outgoing_from_sub_edges;
+    for (ParallelComputationGraphEdge const &outgoing_edge :
+         get_subgraph_outgoing_edges(spcg, matched_nodes)) {
+      parallel_tensor_guid_t original_tensor =
+          get_parallel_tensor(outgoing_edge);
+      PatternNodeOutput pattern_tensor =
+          output_orig_pattern_mapping.at_r(original_tensor);
+      OutputGraphExprNodeOutput output_graph_tensor =
+          sub.outputs_mapping.at_l(pattern_tensor);
+      parallel_tensor_guid_t new_tensor =
+          output_post_outexpr_mapping.at_r(output_graph_tensor);
+
+      SubParallelComputationGraphEdge new_edge =
+          subpcg_edge_from_tensor_and_dst(
+              new_tensor,
+              get_dst_layer(outgoing_edge),
+              get_dst_layer_input_idx(outgoing_edge));
+      outgoing_from_sub_edges.insert(new_edge);
+    }
+
+    return set_union(std::vector{
+        post_edges_from_orig,
+        post_edges_from_sub,
+        incoming_to_sub_edges,
+        outgoing_from_sub_edges,
+    });
+  }();
+
+  std::unordered_set<input_parallel_tensor_guid_t> post_inputs =
+      pre_data.inputs;
+
+  std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+      post_value_data = [&] {
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_orig = filter_keys(
+                pre_data.value_data,
+                [&](open_parallel_tensor_guid_t const &t) {
+                  return visit_open_parallel_tensor_guid(
+                      t,
+                      overload{
+                          [&](parallel_tensor_guid_t const &t) {
+                            return contains(post_nodes_from_original_graph,
+                                            get_source_layer(t));
+                          },
+                          [](input_parallel_tensor_guid_t const &) {
+                            return true;
+                          },
+                      });
+                });
+
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_sub = output_graph_data.value_data;
+        return merge_disjoint_maps(post_value_data_from_orig,
+                                   post_value_data_from_sub);
+      }();
+
+  SubParallelComputationGraphData post_data = SubParallelComputationGraphData{
+      post_node_data,
+      post_edges,
+      post_inputs,
+      post_value_data,
+  };
+
+  return sub_pcg_from_graph_data(post_data);
+}
+
+} // namespace FlexFlow
diff --git a/lib/substitutions/src/substitutions/substitution_internal/evaluate_substitution_output.cc b/lib/substitutions/src/substitutions/apply_substitution/evaluate_substitution_output.cc
similarity index 96%
rename from lib/substitutions/src/substitutions/substitution_internal/evaluate_substitution_output.cc
rename to
lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc index 22e6a9f333..a5fc9a2e06 100644 --- a/lib/substitutions/src/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" #include "substitutions/output_graph/output_graph_expr.h" #include "substitutions/sub_parallel_computation_graph.h" #include "utils/bidict/algorithms/bidict_from_keys_and_values.h" diff --git a/lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc b/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc similarity index 95% rename from lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc rename to lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc index 9fa91d75b7..f49c7e0a3e 100644 --- a/lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/perform_shape_inference.h" +#include "substitutions/apply_substitution/perform_shape_inference.h" #include "op-attrs/get_output_shapes.h" #include "utils/containers/map_keys.h" #include "utils/containers/transform.h" diff --git a/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc b/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc index 53973dc1cb..6f41772a9e 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc @@ -1,5 +1,8 @@ #include "substitutions/operator_pattern/eval_list_access.h" #include "substitutions/operator_pattern/get_attribute.h" +#include "utils/containers/at_idx.h" +#include "utils/containers/make.h" +#include "utils/containers/transform.h" #include "utils/overload.h" namespace FlexFlow { @@ -18,20 +21,12 @@ std::optional [&](auto const &v) -> std::optional { using T = std::decay_t; - if constexpr (std::is_same_v>) { - if (acc.index >= v.size()) { - return std::nullopt; - } else { - int value = v.at(acc.index); - return OperatorAttributeValue{value}; - } + if constexpr (std::is_same_v>) { + return transform(at_idx(v, acc.index), + make()); } else if constexpr (std::is_same_v>) { - if (acc.index >= v.size()) { - return std::nullopt; - } else { - ff_dim_t value = v.at(acc.index); - return OperatorAttributeValue{value}; - } + return transform(at_idx(v, acc.index), + make()); } else { throw mk_runtime_error("Invalid operand"); } diff --git a/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc b/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc index a3ae9c84d1..fb0fd7f47b 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc @@ -1,5 +1,6 @@ #include "substitutions/operator_pattern/eval_list_size.h" #include "substitutions/operator_pattern/get_attribute.h" +#include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" namespace FlexFlow { @@ -18,9 +19,9 @@ std::optional [&](auto const &v) -> std::optional { using T 
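// [editorial note] On the eval_list_access rewrite below: at_idx(v, i) is
// understood to return std::optional<T> (empty when i is out of range), and
// make<OperatorAttributeValue>() a callable wrapping its argument in the
// variant, so transform(at_idx(...), make<...>()) collapses the old explicit
// bounds check and element copy into one optional-mapping expression. Both
// helper semantics are inferred from the new includes, not stated in the
// patch itself.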
= std::decay_t; - if constexpr (std::is_same_v> || + if constexpr (std::is_same_v> || std::is_same_v>) { - size_t size = v.size(); + nonnegative_int size = num_elements(v); return OperatorAttributeValue{size}; } else { throw mk_runtime_error("Invalid operand"); diff --git a/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc b/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc index 442d3345a1..cb733e16ff 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc @@ -8,7 +8,7 @@ std::optional get_attribute(BatchMatmulAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -18,13 +18,13 @@ std::optional get_attribute(BatchNormAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::EPSILON: - return p.eps; + return OperatorAttributeValue{p.eps}; case OperatorAttributeKey::AFFINE: - return p.affine; + return OperatorAttributeValue{p.affine}; case OperatorAttributeKey::MOMENTUM: - return p.momentum; + return OperatorAttributeValue{p.momentum}; default: return std::nullopt; } @@ -34,9 +34,9 @@ std::optional get_attribute(BroadcastAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::TARGET_DIMS: - return p.target_dims; + return OperatorAttributeValue{p.target_dims}; default: return std::nullopt; } @@ -46,9 +46,9 @@ std::optional get_attribute(CastAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::DATA_TYPE: - return p.dtype; + return OperatorAttributeValue{p.dtype}; default: return std::nullopt; } @@ -58,11 +58,11 @@ std::optional get_attribute(CombineAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DIM: - return p.combine_dim; + return OperatorAttributeValue{p.combine_dim}; case OperatorAttributeKey::PARALLEL_DIM: - return p.combine_degree; + return OperatorAttributeValue{p.combine_degree}; default: return std::nullopt; } @@ -72,9 +72,9 @@ std::optional get_attribute(ConcatAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.axis; + return OperatorAttributeValue{p.axis}; default: return std::nullopt; } @@ -84,25 +84,25 @@ std::optional get_attribute(Conv2DAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::KERNEL_H: - return p.kernel_h; + return OperatorAttributeValue{p.kernel_h}; case OperatorAttributeKey::KERNEL_W: - return p.kernel_w; + return OperatorAttributeValue{p.kernel_w}; case OperatorAttributeKey::STRIDE_H: - return p.stride_h; + return OperatorAttributeValue{p.stride_h}; case OperatorAttributeKey::STRIDE_W: - return p.stride_w; + 
return OperatorAttributeValue{p.stride_w}; case OperatorAttributeKey::PADDING_H: - return p.padding_h; + return OperatorAttributeValue{p.padding_h}; case OperatorAttributeKey::PADDING_W: - return p.padding_w; + return OperatorAttributeValue{p.padding_w}; case OperatorAttributeKey::GROUPS: - return p.groups; + return OperatorAttributeValue{p.groups}; case OperatorAttributeKey::ACTIVATION: - return p.activation; + return OperatorAttributeValue{p.activation}; case OperatorAttributeKey::USE_BIAS: - return p.use_bias; + return OperatorAttributeValue{p.use_bias}; default: return std::nullopt; } @@ -112,7 +112,7 @@ std::optional get_attribute(ElementBinaryAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -122,7 +122,7 @@ std::optional get_attribute(ElementUnaryAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -132,7 +132,7 @@ std::optional get_attribute(DropoutAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -142,15 +142,15 @@ std::optional get_attribute(EmbeddingAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::DATA_TYPE: - return p.data_type; + return OperatorAttributeValue{p.data_type}; case OperatorAttributeKey::AGGR: - return p.aggr; + return OperatorAttributeValue{p.aggr}; case OperatorAttributeKey::NUM_ENTRIES: - return p.num_entries; + return OperatorAttributeValue{p.num_entries}; case OperatorAttributeKey::OUT_CHANNELS: - return p.out_channels; + return OperatorAttributeValue{p.out_channels}; default: return std::nullopt; } @@ -160,7 +160,7 @@ std::optional get_attribute(FlatAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -170,9 +170,9 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.dim; + return OperatorAttributeValue{p.dim}; default: return std::nullopt; } @@ -182,7 +182,7 @@ std::optional get_attribute(InputAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -192,11 +192,11 @@ std::optional get_attribute(LayerNormAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AFFINE: - return p.elementwise_affine; + return OperatorAttributeValue{p.elementwise_affine}; case OperatorAttributeKey::AXES: - return vector_of(p.axes); + return OperatorAttributeValue{vector_of(p.axes)}; default: return std::nullopt; } @@ -206,17 +206,17 @@ std::optional get_attribute(LinearAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: 
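// [editorial note] The mechanical OperatorAttributeValue{...} wrapping
// throughout this file follows from dropping `explicit_constructors = false`
// in operator_attribute_value.variant.toml earlier in the patch: implicit
// conversion into the generated variant is no longer available, so each
// return site now constructs the sum type explicitly.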
- return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::OUT_CHANNELS: - return p.out_channels; + return OperatorAttributeValue{p.out_channels}; case OperatorAttributeKey::USE_BIAS: - return p.use_bias; + return OperatorAttributeValue{p.use_bias}; case OperatorAttributeKey::DATA_TYPE: - return p.data_type; + return OperatorAttributeValue{p.data_type}; case OperatorAttributeKey::ACTIVATION: - return p.activation; + return OperatorAttributeValue{p.activation}; case OperatorAttributeKey::REGULARIZER: - return p.regularizer; + return OperatorAttributeValue{p.regularizer}; default: return std::nullopt; } @@ -226,13 +226,13 @@ std::optional get_attribute(MultiHeadAttentionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::NUM_HEADS: - return p.num_heads; + return OperatorAttributeValue{p.num_heads}; case OperatorAttributeKey::USE_BIAS: - return p.bias; + return OperatorAttributeValue{p.bias}; case OperatorAttributeKey::DROPOUT: - return p.dropout; + return OperatorAttributeValue{p.dropout}; default: return std::nullopt; } @@ -242,7 +242,7 @@ std::optional get_attribute(NoopAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -252,23 +252,23 @@ std::optional get_attribute(Pool2DAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::KERNEL_H: - return p.kernel_h; + return OperatorAttributeValue{p.kernel_h}; case OperatorAttributeKey::KERNEL_W: - return p.kernel_w; + return OperatorAttributeValue{p.kernel_w}; case OperatorAttributeKey::STRIDE_H: - return p.stride_h; + return OperatorAttributeValue{p.stride_h}; case OperatorAttributeKey::STRIDE_W: - return p.stride_w; + return OperatorAttributeValue{p.stride_w}; case OperatorAttributeKey::PADDING_H: - return p.padding_h; + return OperatorAttributeValue{p.padding_h}; case OperatorAttributeKey::PADDING_W: - return p.padding_w; + return OperatorAttributeValue{p.padding_w}; case OperatorAttributeKey::POOL_TYPE: - return p.pool_type; + return OperatorAttributeValue{p.pool_type}; case OperatorAttributeKey::ACTIVATION: - return std::optional{p.activation}; + return OperatorAttributeValue{p.activation}; default: return std::nullopt; } @@ -278,7 +278,7 @@ std::optional get_attribute(ReduceAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -288,9 +288,9 @@ std::optional get_attribute(ReductionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.reduction_degree; + return OperatorAttributeValue{p.reduction_degree}; default: return std::nullopt; } @@ -300,11 +300,11 @@ std::optional get_attribute(RepartitionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DIM: - return p.repartition_dim; + return 
OperatorAttributeValue{p.repartition_dim}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.repartition_degree; + return OperatorAttributeValue{p.repartition_degree}; default: return std::nullopt; } @@ -314,9 +314,9 @@ std::optional get_attribute(ReplicateAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.replicate_degree; + return OperatorAttributeValue{p.replicate_degree}; default: return std::nullopt; } @@ -326,7 +326,7 @@ std::optional get_attribute(ReshapeAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -336,9 +336,9 @@ std::optional get_attribute(ReverseAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.axis; + return OperatorAttributeValue{p.axis}; default: return std::nullopt; } @@ -348,9 +348,9 @@ std::optional get_attribute(SplitAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.axis; + return OperatorAttributeValue{p.axis}; default: return std::nullopt; } @@ -360,9 +360,9 @@ std::optional get_attribute(SoftmaxAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.dim; + return OperatorAttributeValue{p.dim}; default: return std::nullopt; } @@ -372,7 +372,7 @@ std::optional get_attribute(TopKAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -382,9 +382,9 @@ std::optional get_attribute(TransposeAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PERMUTATION: - return vector_of(p.perm); + return OperatorAttributeValue{vector_of(p.perm)}; default: return std::nullopt; } @@ -394,7 +394,7 @@ std::optional get_attribute(WeightAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } diff --git a/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc b/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc new file mode 100644 index 0000000000..f1b7440aed --- /dev/null +++ b/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc @@ -0,0 +1,25 @@ +#include "substitutions/operator_pattern/get_attribute_map.h" +#include "substitutions/operator_pattern/get_attribute.h" +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_key.h" +#include "substitutions/operator_pattern/operator_attribute_value.dtg.h" + +namespace FlexFlow { + +std::unordered_map + get_attribute_map(PCGOperatorAttrs const &op_attrs) { + 
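+  // Collects every attribute this operator actually defines: probes
+  // get_attribute with each known key and keeps only the keys for which a
+  // value comes back, so absent attributes are omitted from the map rather
+  // than mapped to a sentinel. E.g. for a LinearAttrs op the result is
+  // expected to contain entries such as {OP_TYPE -> LINEAR,
+  // OUT_CHANNELS -> ..., USE_BIAS -> ...}.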
std::unordered_map result; + + for (OperatorAttributeKey const &attr_key : all_operator_attribute_keys()) { + std::optional attr_value = + get_attribute(op_attrs, attr_key); + + if (attr_value.has_value()) { + result.insert({attr_key, attr_value.value()}); + } + } + + return result; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc index 5ab528ed3d..29aef07e3a 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc @@ -20,6 +20,16 @@ OperatorAttributeConstraint }; } +OperatorAttributeConstraint + op_attr_key_divisible_by(OperatorAttributeKey key, + nonnegative_int denominator) { + return OperatorAttributeConstraint{ + ConstraintType::DIVISIBLE_BY, + OperatorAttributeExpr{key}, + OperatorAttributeValue{denominator}, + }; +} + OperatorAttributeConstraint make_equals_constraint(OperatorAttributeExpr const &expr, OperatorAttributeValue const &val) { diff --git a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc new file mode 100644 index 0000000000..232d2c2f12 --- /dev/null +++ b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc @@ -0,0 +1,68 @@ +#include "substitutions/operator_pattern/operator_attribute_key.h" + +namespace FlexFlow { + +// This should probably be integrated into proj, +// tracked in https://github.com/flexflow/FlexFlow/issues/1478 +std::vector all_operator_attribute_keys() { + return { + OperatorAttributeKey::OP_TYPE, + OperatorAttributeKey::USE_BIAS, + OperatorAttributeKey::GROUPS, + OperatorAttributeKey::POOL_TYPE, + OperatorAttributeKey::KERNEL_H, + OperatorAttributeKey::KERNEL_W, + OperatorAttributeKey::DATA_TYPE, + OperatorAttributeKey::SCALAR, + OperatorAttributeKey::STRIDE_H, + OperatorAttributeKey::STRIDE_W, + OperatorAttributeKey::PADDING_H, + OperatorAttributeKey::PADDING_W, + OperatorAttributeKey::AGGR, + OperatorAttributeKey::NUM_ENTRIES, + OperatorAttributeKey::OUT_CHANNELS, + OperatorAttributeKey::ACTIVATION, + OperatorAttributeKey::NUMDIM, + OperatorAttributeKey::AXIS, + OperatorAttributeKey::PERMUTATION, + OperatorAttributeKey::OUTSHUFFLE, + OperatorAttributeKey::MERGE_GCONV_COUNT, + OperatorAttributeKey::AXES, + OperatorAttributeKey::KEEP_DIMS, + OperatorAttributeKey::EPSILON, + OperatorAttributeKey::PARALLEL_OP_DIM, + OperatorAttributeKey::PARALLEL_OP_DEGREE, + OperatorAttributeKey::SOFTMAX_DIM, + OperatorAttributeKey::NUM_HEADS, + OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeKey::PAD, + OperatorAttributeKey::EMBED_DIM, + OperatorAttributeKey::KDIM, + OperatorAttributeKey::VDIM, + OperatorAttributeKey::DROPOUT, + OperatorAttributeKey::BIAS, + OperatorAttributeKey::ADD_BIAS_KV, + OperatorAttributeKey::ADD_ZERO_ATTN, + OperatorAttributeKey::A_SEQ_LENGTH_DIM, + OperatorAttributeKey::B_SEQ_LENGTH_DIM, + OperatorAttributeKey::RELU, + OperatorAttributeKey::TARGET_DIMS, + OperatorAttributeKey::RATE, + OperatorAttributeKey::SEED, + OperatorAttributeKey::SHOULD_BROADCAST_LHS, + OperatorAttributeKey::SHOULD_BROADCAST_RHS, + OperatorAttributeKey::DIM, + OperatorAttributeKey::ELEMENTWISE_AFFINE, + OperatorAttributeKey::REGULARIZER, + OperatorAttributeKey::SHAPE, + 
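+      // NOTE: this list is maintained by hand (see the issue linked above)
+      // and must stay in sync with the OperatorAttributeKey enum; any key
+      // omitted here is silently skipped by get_attribute_map.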
OperatorAttributeKey::SPLITS, + OperatorAttributeKey::K, + OperatorAttributeKey::SORTED, + OperatorAttributeKey::COMBINE_DIM, + OperatorAttributeKey::COMBINE_DEGREE, + OperatorAttributeKey::NUM_INPUTS, + }; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc index 7d65f687c8..4f11b343f8 100644 --- a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc +++ b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc @@ -33,10 +33,12 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( switch (op_type) { case OperatorType::MULTIHEAD_ATTENTION: return PCGOperatorAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/acc.get(OperatorAttributeKey::EMBED_DIM), - /*num_heads=*/acc.get(OperatorAttributeKey::NUM_HEADS), - /*kdim=*/acc.get(OperatorAttributeKey::KDIM), - /*vdim=*/acc.get(OperatorAttributeKey::VDIM), + /*embed_dim=*/acc.get( + OperatorAttributeKey::EMBED_DIM), + /*num_heads=*/ + acc.get(OperatorAttributeKey::NUM_HEADS), + /*kdim=*/acc.get(OperatorAttributeKey::KDIM), + /*vdim=*/acc.get(OperatorAttributeKey::VDIM), /*dropout=*/acc.get(OperatorAttributeKey::DROPOUT), /*bias=*/acc.get(OperatorAttributeKey::BIAS), /*add_bias_kv=*/acc.get(OperatorAttributeKey::ADD_BIAS_KV), @@ -44,12 +46,14 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( }}; case OperatorType::POOL2D: return PCGOperatorAttrs{Pool2DAttrs{ - /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), - /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), - /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), - /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), - /*padding_h=*/acc.get(OperatorAttributeKey::PADDING_H), - /*padding_w=*/acc.get(OperatorAttributeKey::PADDING_W), + /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), + /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), + /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), + /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), + /*padding_h=*/ + acc.get(OperatorAttributeKey::PADDING_H), + /*padding_w=*/ + acc.get(OperatorAttributeKey::PADDING_W), /*pool_type=*/acc.get(OperatorAttributeKey::POOL_TYPE), /*activation=*/ acc.get>(OperatorAttributeKey::ACTIVATION) @@ -62,7 +66,8 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( case OperatorType::DROPOUT: case OperatorType::LINEAR: return PCGOperatorAttrs{LinearAttrs{ - /*out_channels=*/acc.get(OperatorAttributeKey::OUT_CHANNELS), + /*out_channels=*/acc.get( + OperatorAttributeKey::OUT_CHANNELS), /*use_bias=*/acc.get(OperatorAttributeKey::USE_BIAS), /*data_type=*/acc.get(OperatorAttributeKey::DATA_TYPE), /*activation=*/ diff --git a/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc b/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc index 3d6aadc795..f6d1410a07 100644 --- a/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc +++ b/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc @@ -1,9 +1,18 @@ #include "substitutions/output_graph/output_graph_expr.h" #include "utils/containers/transform.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { +std::unordered_set get_nodes(OutputGraphExpr const &g) { + std::unordered_set raw_nodes = 
get_nodes(g.raw_graph); + + return transform(raw_nodes, + [](Node const &n) { return OutputGraphExprNode{n}; }); +} + std::vector get_node_outputs(OutputGraphExpr const &g, OutputGraphExprNode const &n) { std::vector raw_outputs = @@ -14,4 +23,13 @@ std::vector }); } +std::unordered_set get_inputs(OutputGraphExpr const &g) { + std::unordered_set raw_inputs = + get_open_dataflow_graph_inputs(g.raw_graph); + + return transform(raw_inputs, [](DataflowGraphInput const &i) { + return OutputGraphExprInput{i}; + }); +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc b/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc new file mode 100644 index 0000000000..b35f3bbeae --- /dev/null +++ b/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc @@ -0,0 +1,30 @@ +#include "substitutions/output_graph/output_graph_expr_value.h" +#include "utils/overload.h" + +namespace FlexFlow { + +OpenDataflowValue raw_open_dataflow_value_from_output_graph_expr_value( + OutputGraphExprValue const &v) { + return v.visit(overload{ + [](OutputGraphExprNodeOutput const &o) { + return OpenDataflowValue{o.raw_dataflow_output}; + }, + [](OutputGraphExprInput const &i) { + return OpenDataflowValue{i.raw_dataflow_graph_input}; + }, + }); +} + +OutputGraphExprValue output_graph_expr_value_from_raw_open_dataflow_value( + OpenDataflowValue const &v) { + return v.visit(overload{ + [](DataflowOutput const &o) { + return OutputGraphExprValue{OutputGraphExprNodeOutput{o}}; + }, + [](DataflowGraphInput const &i) { + return OutputGraphExprValue{OutputGraphExprInput{i}}; + }, + }); +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc b/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc index fa247cd151..f6b90ef054 100644 --- a/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc +++ b/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc @@ -1,7 +1,9 @@ #include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/operator_pattern/get_attribute_map.h" #include "substitutions/output_graph/materialize_operator_from_attrs_map.h" #include "substitutions/output_graph/output_operator_attribute_expr.h" #include "utils/containers/map_values.h" +#include "utils/containers/merge_maps.h" namespace FlexFlow { @@ -12,14 +14,31 @@ OutputOperatorAttrsAssignment output_operator_clone_node(PatternNode const &) { PCGOperatorAttrs materialize_output_operator_from_attrs_assignment( OutputOperatorAttrsAssignment const &attrs_assignment, std::unordered_map const &node_match) { - std::unordered_map attr_map = - map_values(attrs_assignment.assignments, - [&](OutputOperatorAttributeExpr const &expr) { - return evaluate_output_operator_attribute_expr(expr, - node_match); - }); - - return materialize_operator_from_attrs_map(attr_map); + + std::unordered_map + template_attrs_map = [&]() + -> std::unordered_map { + if (attrs_assignment.template_operator.has_value()) { + PatternNode template_node = attrs_assignment.template_operator.value(); + PCGOperatorAttrs template_op_attrs = node_match.at(template_node); + return get_attribute_map(template_op_attrs); + } else { + return {}; + } + }(); + + std::unordered_map + assignments_attrs_map = map_values( + attrs_assignment.assignments, + [&](OutputOperatorAttributeExpr const &expr) { + return 
evaluate_output_operator_attribute_expr(expr, node_match); + }); + + std::unordered_map + joined_attrs_map = + merge_map_right_dominates(template_attrs_map, assignments_attrs_map); + + return materialize_operator_from_attrs_map(joined_attrs_map); } std::pair @@ -39,4 +58,10 @@ std::pair }; } +std::pair + set_op_type_attr(OperatorType op_type) { + return set_attr_to_constant(OperatorAttributeKey::OP_TYPE, + OperatorAttributeValue{op_type}); +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index e53877006d..a0af875848 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -3,13 +3,23 @@ #include "substitutions/pcg_pattern_match.h" #include "substitutions/sub_parallel_computation_graph.h" #include "substitutions/tensor_pattern/satisfies_pattern.h" +#include "substitutions/unlabelled/find_pattern_matches.h" #include "substitutions/unlabelled/pattern_value.h" #include "utils/containers/map_values.h" #include "utils/containers/transform.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { +std::unordered_set get_nodes(PCGPattern const &p) { + std::unordered_set raw_nodes = get_nodes(p.raw_graph); + + return transform(raw_nodes, [](Node const &n) { return PatternNode{n}; }); +} + static MatchAdditionalCriterion pcg_pattern_criteria(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { @@ -63,6 +73,14 @@ OperatorAttributePattern get_operator_pattern(PCGPattern const &p, return p.raw_graph.at(n.raw_node); } +std::unordered_set get_inputs(PCGPattern const &p) { + std::unordered_set raw_inputs = + get_open_dataflow_graph_inputs(p.raw_graph); + + return transform(raw_inputs, + [](DataflowGraphInput const &i) { return PatternInput{i}; }); +} + std::vector get_pattern_node_outputs(PCGPattern const &pattern, PatternNode const &node) { diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc index 0c673f0a8a..83df74f21b 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc @@ -188,33 +188,34 @@ bool sub_pcgs_are_isomorphic(SubParallelComputationGraph const &lhs, } std::string as_dot(SubParallelComputationGraph const &spcg) { - std::function get_node_label = - [](ParallelLayerAttrs const &a) -> std::string { - RecordFormatter r = as_dot(a.op_attrs); - - if (a.name.has_value()) { - RecordFormatter rr; - rr << "Name" << a.name.value(); - r << rr; - } - - std::ostringstream oss; - oss << r; - return oss.str(); - }; - - std::function get_input_label = - [](ParallelTensorAttrs const &a) -> std::string { - RecordFormatter r; - - r << fmt::to_string(a.shape); - - std::ostringstream oss; - oss << r; - return oss.str(); - }; - - return as_dot(spcg.raw_graph, get_node_label, get_input_label); + NOT_IMPLEMENTED(); + // std::function get_node_label = + // [](ParallelLayerAttrs const &a) -> std::string { + // RecordFormatter r = as_dot(a.op_attrs); + // + // if (a.name.has_value()) { + // RecordFormatter rr; + // rr << "Name" << a.name.value(); + // r << rr; + // } + // + // std::ostringstream oss; + // oss << r; + // return 
oss.str(); + // }; + // + // std::function get_input_label = + // [](ParallelTensorAttrs const &a) -> std::string { + // RecordFormatter r; + // + // r << fmt::to_string(a.shape); + // + // std::ostringstream oss; + // oss << r; + // return oss.str(); + // }; + // + // return as_dot(spcg.raw_graph, get_node_label, get_input_label); } void debug_print_dot(SubParallelComputationGraph const &spcg) { diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc index bb8cb449bc..0d2b912049 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc @@ -6,7 +6,7 @@ namespace FlexFlow { SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_dst(parallel_tensor_guid_t const &tensor, parallel_layer_guid_t const &layer, - int input_idx) { + nonnegative_int input_idx) { return SubParallelComputationGraphEdge{ OpenDataflowEdge{ DataflowEdge{ diff --git a/lib/substitutions/src/substitutions/substitution.cc b/lib/substitutions/src/substitutions/substitution.cc index 22e15cb01a..874700d303 100644 --- a/lib/substitutions/src/substitutions/substitution.cc +++ b/lib/substitutions/src/substitutions/substitution.cc @@ -1,169 +1,164 @@ #include "substitutions/substitution.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" -#include "substitutions/open_parallel_tensor_guid_t.h" -#include "substitutions/output_graph/output_operator_attrs_assignment.h" -#include "substitutions/pcg_pattern_match.h" -#include "substitutions/sub_parallel_computation_graph.h" -#include "substitutions/sub_parallel_computation_graph_edge.h" -#include "substitutions/substitution_internal/evaluate_substitution_output.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h" -#include "utils/containers/merge_maps.h" -#include "utils/containers/restrict_keys.h" -#include "utils/containers/set_minus.h" -#include "utils/containers/values.h" -#include "utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.h" -#include "utils/graph/node/algorithms.h" -#include "utils/overload.h" +#include "substitutions/output_graph/output_graph_expr.h" +#include "substitutions/pcg_pattern.h" +#include "utils/bidict/algorithms/left_entries.h" +#include "utils/bidict/algorithms/right_entries.h" +#include "utils/containers/map_values.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/find_isomorphism.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/rewrite_node_labels.h" +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.dtg.h" +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h" namespace FlexFlow { -bool is_valid_substitution(Substitution const &) { - NOT_IMPLEMENTED(); -} +bool is_isomorphic_to(Substitution const &l, Substitution const &r) { + OpenDataflowGraphIsomorphism pcg_pattern_isomorphism = ({ + std::optional maybe_isomorphism = + find_isomorphism(l.pcg_pattern.raw_graph, r.pcg_pattern.raw_graph); -SubParallelComputationGraph - apply_substitution(SubParallelComputationGraph const &spcg, - Substitution const &sub, - PCGPatternMatch const &match) { - auto substitution_output_result = - evaluate_substitution_output(spcg, sub, match); - SubParallelComputationGraph substitution_output_graph = - 
substitution_output_result.first; - OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = - substitution_output_result.second; - - SubParallelComputationGraphData output_graph_data = - get_sub_pcg_data(substitution_output_graph); - SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); - - std::unordered_set pre_nodes = - keys(pre_data.node_data); - std::unordered_set matched_nodes = - unordered_set_of(values(match.node_assignment)); - std::unordered_set post_nodes_from_original_graph = - set_minus(pre_nodes, matched_nodes); - - std::unordered_map post_node_data = - [&] { - std::unordered_map - post_node_data_from_orig = restrict_keys( - pre_data.node_data, post_nodes_from_original_graph); - std::unordered_map - post_node_data_from_sub = output_graph_data.node_data; - - return merge_maps(post_node_data_from_orig, post_node_data_from_sub); - }(); - - std::unordered_set post_edges = [&] { - std::unordered_set post_edges_from_orig = - filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { - if (e.raw_edge.has()) { - return true; - } else { - DataflowEdge dfe = e.raw_edge.get(); - parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; - parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; - return !(contains(matched_nodes, src) || - contains(matched_nodes, dst)); - } - }); - - std::unordered_set post_edges_from_sub = - filter(output_graph_data.edges, - [&](SubParallelComputationGraphEdge const &e) { - return !e.raw_edge.has(); - }); - - bidict - output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( - match, sub.pcg_pattern, spcg); - bidict - output_post_outexpr_mapping = get_output_graph_expr_output_mapping( - output_expr_to_result_sub_pcg_mapping, - sub.output_graph_expr, - substitution_output_graph); - - std::unordered_set incoming_to_sub_edges; - for (auto const &[pattern_input, base_graph_tensor] : - match.input_assignment) { - OutputGraphExprInput output_expr_input = - sub.inputs_mapping.at_l(pattern_input); - input_parallel_tensor_guid_t output_graph_input = - output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( - output_expr_input); - std::unordered_set uses = get_parallel_tensor_uses( - substitution_output_graph, - open_parallel_tensor_guid_from_input(output_graph_input)); - for (parallel_tensor_use_t const &use : uses) { - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_use(base_graph_tensor, use); - incoming_to_sub_edges.insert(new_edge); - } + if (!maybe_isomorphism.has_value()) { + return false; } - std::unordered_set outgoing_from_sub_edges; - for (ParallelComputationGraphEdge const &outgoing_edge : - get_subgraph_outgoing_edges(spcg, matched_nodes)) { - parallel_tensor_guid_t original_tensor = - get_parallel_tensor(outgoing_edge); - PatternNodeOutput pattern_tensor = - output_orig_pattern_mapping.at_r(original_tensor); - OutputGraphExprNodeOutput output_graph_tensor = - sub.outputs_mapping.at_l(pattern_tensor); - parallel_tensor_guid_t new_tensor = - output_post_outexpr_mapping.at_r(output_graph_tensor); - - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_dst( - new_tensor, - get_dst_layer(outgoing_edge), - get_dst_layer_input_idx(outgoing_edge)); - outgoing_from_sub_edges.insert(new_edge); - } + maybe_isomorphism.value(); + }); + + auto l_from_r_pattern_node = [&](PatternNode const &r_node) { + return PatternNode{ + pcg_pattern_isomorphism.node_mapping.at_r(r_node.raw_node), + }; + }; - return set_union(std::vector{ - post_edges_from_orig, 
- post_edges_from_sub, - incoming_to_sub_edges, - outgoing_from_sub_edges, - }); - }(); - - std::unordered_set post_inputs = - pre_data.inputs; - - std::unordered_map - post_value_data = [&] { - std::unordered_map - post_value_data_from_orig = filter_keys( - pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { - return visit_open_parallel_tensor_guid( - t, - overload{ - [&](parallel_tensor_guid_t const &t) { - return contains(post_nodes_from_original_graph, - get_source_layer(t)); - }, - [](input_parallel_tensor_guid_t const &) { - return true; - }, - }); + auto l_from_r_output_attrs_assignment = + [&](OutputOperatorAttrsAssignment const &r_attrs) { + std::optional l_template_operator = + transform(r_attrs.template_operator, l_from_r_pattern_node); + std::unordered_map + l_assignments = map_values( + r_attrs.assignments, + [&](OutputOperatorAttributeExpr const &r_expr) { + return r_expr.visit( + overload{[&](AttrConstant const &) { return r_expr; }, + [&](OutputOperatorAttrAccess const &r_acc) { + return OutputOperatorAttributeExpr{ + OutputOperatorAttrAccess{ + l_from_r_pattern_node(r_acc.node), + r_acc.attr_expr, + }, + }; + }}); }); + return OutputOperatorAttrsAssignment{ + l_template_operator, + l_assignments, + }; + }; + + OpenDataflowGraphIsomorphism output_graph_expr_isomorphism = ({ + std::optional maybe_isomorphism = + find_isomorphism( + l.output_graph_expr.raw_graph, + rewrite_node_labels( + r.output_graph_expr.raw_graph, + [&](Node const &, OutputOperatorAttrsAssignment const &a) { + return l_from_r_output_attrs_assignment(a); + })); + if (!maybe_isomorphism.has_value()) { + return false; + } - std::unordered_map - post_value_data_from_sub = output_graph_data.value_data; - return merge_maps(post_value_data_from_orig, post_value_data_from_sub); - }(); + maybe_isomorphism.value(); + }); - SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ - post_node_data, - post_edges, - post_inputs, - post_value_data, + auto l_from_r_pattern_input = [&](PatternInput const &r_input) { + return PatternInput{ + pcg_pattern_isomorphism.input_mapping.at_r( + r_input.raw_dataflow_graph_input), + }; }; - return sub_pcg_from_graph_data(post_data); + auto l_from_r_output_graph_input = [&](OutputGraphExprInput const &r_input) { + return OutputGraphExprInput{ + output_graph_expr_isomorphism.input_mapping.at_r( + r_input.raw_dataflow_graph_input), + }; + }; + + auto l_from_r_pattern_output = [&](PatternNodeOutput const &r_output) { + return PatternNodeOutput{ + isomorphism_map_l_dataflow_output_from_r(pcg_pattern_isomorphism, + r_output.raw_dataflow_output), + }; + }; + + auto l_from_r_output_graph_output = + [&](OutputGraphExprNodeOutput const &r_output) { + return OutputGraphExprNodeOutput{ + isomorphism_map_l_dataflow_output_from_r( + output_graph_expr_isomorphism, r_output.raw_dataflow_output), + }; + }; + + bidict l_input_mapping_from_r = + transform(r.inputs_mapping, + [&](PatternInput const &r_p, OutputGraphExprInput const &r_o) { + return std::pair{ + l_from_r_pattern_input(r_p), + l_from_r_output_graph_input(r_o), + }; + }); + if (l_input_mapping_from_r != l.inputs_mapping) { + return false; + } + + bidict l_output_mapping_from_r = + transform(r.outputs_mapping, + [&](PatternNodeOutput const &r_p, + OutputGraphExprNodeOutput const &r_o) { + return std::pair{ + l_from_r_pattern_output(r_p), + l_from_r_output_graph_output(r_o), + }; + }); + if (l_output_mapping_from_r != l.outputs_mapping) { + return false; + } + + return true; +} + +bool 
is_valid_substitution(Substitution const &sub) { + { + std::unordered_set pattern_inputs = + get_inputs(sub.pcg_pattern); + std::unordered_set mapped_inputs = + left_entries(sub.inputs_mapping); + + if (pattern_inputs != mapped_inputs) { + return false; + } + } + + { + std::unordered_set output_graph_inputs = + get_inputs(sub.output_graph_expr); + std::unordered_set mapped_inputs = + right_entries(sub.inputs_mapping); + + if (output_graph_inputs != mapped_inputs) { + return false; + } + } + + if (get_nodes(sub.pcg_pattern).empty()) { + return false; + } + + if (get_nodes(sub.output_graph_expr).empty()) { + return false; + } + + return true; } } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/substitution_builder.cc b/lib/substitutions/src/substitutions/substitution_builder.cc new file mode 100644 index 0000000000..a267b8113f --- /dev/null +++ b/lib/substitutions/src/substitutions/substitution_builder.cc @@ -0,0 +1,162 @@ +#include "substitutions/substitution_builder.h" +#include "substitutions/output_graph/output_graph_expr_value.h" +#include "substitutions/substitution.h" +#include "substitutions/unlabelled/pattern_value.h" +#include "utils/containers/repeat_element.h" +#include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" +#include "utils/overload.h" + +namespace FlexFlow { + +SubstitutionBuilder::SubstitutionBuilder() + : pattern_g(LabelledOpenDataflowGraph:: + create>()), + output_g(LabelledOpenDataflowGraph:: + create>()) {} + +std::pair SubstitutionBuilder::add_input( + TensorAttributePattern const &input_tensor_pattern, + std::optional const &name) { + PatternInput pattern_input = PatternInput{ + this->pattern_g.add_input(input_tensor_pattern), + }; + + OutputGraphExprInput output_graph_expr_input = OutputGraphExprInput{ + this->output_g.add_input(std::monostate{}), + }; + + this->input_mapping.equate(pattern_input, output_graph_expr_input); + + if (name.has_value()) { + this->pattern_input_names.equate(pattern_input, name.value()); + } + + return { + PatternValue{pattern_input}, + OutputGraphExprValue{output_graph_expr_input}, + }; +} + +std::vector SubstitutionBuilder::add_pattern_node( + OperatorAttributePattern const &node_pattern, + std::vector const &inputs, + std::vector const &output_patterns, + std::optional const &maybe_name) { + NodeAddedResult node_added = this->pattern_g.add_node( + node_pattern, + transform(inputs, raw_open_dataflow_value_from_pattern_value), + output_patterns); + + if (maybe_name.has_value()) { + std::string name = maybe_name.value(); + + if (this->pattern_node_names.contains_r(name)) { + throw mk_runtime_error(fmt::format("Attempted to name node {}, but a " + "node with that name already exists!", + name)); + } + + this->pattern_node_names.equate(PatternNode{node_added.node}, name); + } + + return transform(node_added.outputs, [](DataflowOutput const &o) { + return pattern_value_from_raw_open_dataflow_value(OpenDataflowValue{o}); + }); +} + +std::vector SubstitutionBuilder::add_output_graph_node( + OutputOperatorAttrsAssignment const &node_expr, + std::vector const &inputs, + nonnegative_int num_outputs) { + NodeAddedResult node_added = this->output_g.add_node( + node_expr, + transform(inputs, raw_open_dataflow_value_from_output_graph_expr_value), + repeat_element(/*num_times=*/num_outputs, /*element=*/std::monostate{})); + + return transform(node_added.outputs, [](DataflowOutput const &o) { + return output_graph_expr_value_from_raw_open_dataflow_value( + OpenDataflowValue{o}); + }); +} + +void 
SubstitutionBuilder::equate_outputs( + PatternValue const &maybe_pattern_output, + OutputGraphExprValue const &maybe_output_graph_expr_output) { + PatternNodeOutput pattern_output = + maybe_pattern_output.visit(overload{ + [](PatternNodeOutput const &o) { return o; }, + [&](PatternInput const &) -> PatternNodeOutput { + throw mk_runtime_error(fmt::format( + "SubstitutionBuilder::equate_outputs expected a PatternValue " + "holding a PatternNodeOutput, but received {}", + maybe_pattern_output)); + }, + }); + + OutputGraphExprNodeOutput output_graph_expr_output = + maybe_output_graph_expr_output.visit(overload{ + [](OutputGraphExprNodeOutput const &o) { return o; }, + [&](OutputGraphExprInput const &) -> OutputGraphExprNodeOutput { + throw mk_runtime_error( + fmt::format("SubstitutionBuilder::equate_outputs expected an " + "OutputGraphExprValue holding a " + "OutputGraphExprNodeOutput, but received {}", + maybe_output_graph_expr_output)); + }, + }); + + if (this->output_mapping.contains_l(pattern_output)) { + throw mk_runtime_error( + fmt::format("SubstitutionBuilder::equate_outputs expected a " + "PatternValue holding a PatternNodeOutput " + "that is not already contained in output_mapping, " + "but received {}", + pattern_output)); + } + if (this->output_mapping.contains_r(output_graph_expr_output)) { + throw mk_runtime_error(fmt::format( + "SubstitutionBuilder::equate_outputs expected an " + "OutputGraphExprValue holding an OutputGraphExprNodeOutput " + "that is not already contained in output_mapping, " + "but received {}", + output_graph_expr_output)); + } + + this->output_mapping.equate(pattern_output, output_graph_expr_output); +} + +PatternNode + SubstitutionBuilder::pattern_node_named(std::string const &name) const { + return this->pattern_node_names.at_r(name); +} + +PatternInput + SubstitutionBuilder::pattern_input_named(std::string const &name) const { + return this->pattern_input_names.at_r(name); +} + +Substitution SubstitutionBuilder::get_substitution() const { + Substitution result = Substitution{ + PCGPattern{this->pattern_g}, + OutputGraphExpr{this->output_g}, + this->input_mapping, + this->output_mapping, + }; + + if (!is_valid_substitution(result)) { + throw mk_runtime_error( + "get_substitution cannot return a Substitution, as the Substitution is " + "currently invalid. 
Ensure you have finished constructing the " + "Substitution and have mapped all of the outputs."); + } + + return result; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc index efbcf4a6f1..7bfb1f5e9e 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc @@ -11,9 +11,8 @@ TensorAttributeValue TensorAttributeValue from_attr = get_attribute(attrs, acc.attribute_key); return from_attr.visit(overload{ - [&](std::vector const &v) -> TensorAttributeValue { - return TensorAttributeValue{ - static_cast(at_idx(v, acc.index).value())}; + [&](std::vector const &v) -> TensorAttributeValue { + return TensorAttributeValue{at_idx(v, acc.index).value()}; }, [](auto &&) -> TensorAttributeValue { throw mk_runtime_error("Invalid operand"); diff --git a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc index d1e97adc37..5acfdf406a 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc @@ -1,5 +1,6 @@ #include "substitutions/tensor_pattern/eval_list_size.h" #include "substitutions/tensor_pattern/get_attribute.h" +#include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" namespace FlexFlow { @@ -9,8 +10,8 @@ TensorAttributeValue eval_list_size(ParallelTensorAttrs const &attrs, TensorAttributeValue from_attr = get_attribute(attrs, acc.attribute_key); return from_attr.visit(overload{ - [](std::vector const &v) -> TensorAttributeValue { - return TensorAttributeValue{v.size()}; + [](std::vector const &v) -> TensorAttributeValue { + return TensorAttributeValue{num_elements(v)}; }, [](auto &&) -> TensorAttributeValue { throw mk_runtime_error("Invalid operand"); diff --git a/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc b/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc index 286bc69b84..3539b06832 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc @@ -10,15 +10,15 @@ TensorAttributeValue get_attribute(ParallelTensorAttrs const &attrs, TensorAttributeKey key) { switch (key) { case TensorAttributeKey::DIM_SIZES: { - std::vector sizes = - transform(vector_of(ff_ordered_shard_dims(attrs.shape.dims)), - [](ShardParallelDim const &d) { return d.size; }); + std::vector sizes = transform( + vector_of(ff_ordered_shard_dims(attrs.shape.dims)), + [](ShardParallelDim const &d) { return nonnegative_int{d.size}; }); return TensorAttributeValue{sizes}; } case TensorAttributeKey::DIM_DEGREES: { - std::vector degrees = transform( + std::vector degrees = transform( vector_of(ff_ordered_shard_dims(attrs.shape.dims)), - [](ShardParallelDim const &d) { return size_t_from_int(d.degree); }); + [](ShardParallelDim const &d) { return nonnegative_int{d.degree}; }); return TensorAttributeValue{degrees}; } default: diff --git a/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc b/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc index 794ab5abda..e1c1fe7cf6 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc +++ 
b/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc @@ -1,4 +1,5 @@ #include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/integer_conversions.h" namespace FlexFlow { @@ -6,4 +7,19 @@ TensorAttributePattern tensor_attribute_pattern_match_all() { return TensorAttributePattern{{}}; } +TensorAttributePattern + tensor_attr_pattern_require_num_dims(nonnegative_int num_dims) { + return TensorAttributePattern{{ + TensorAttributeConstraint{ + ConstraintType::EQUAL, + TensorAttributeExpr{ + TensorAttributeListSize{ + TensorAttributeKey::DIM_SIZES, + }, + }, + TensorAttributeValue{num_dims}, + }, + }}; +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/unity_substitution_set.cc b/lib/substitutions/src/substitutions/unity_substitution_set.cc new file mode 100644 index 0000000000..4b00cdd95f --- /dev/null +++ b/lib/substitutions/src/substitutions/unity_substitution_set.cc @@ -0,0 +1,235 @@ +#include "substitutions/unity_substitution_set.h" +#include "pcg/machine_specification.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/substitution_builder.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/nonnegative_int/nonnegative_range.h" + +namespace FlexFlow { + +std::vector + get_substitution_set(MachineSpecification const &resources) { + std::vector substitutions; + for (nonnegative_int num_dims : + nonnegative_range(1_n, nonnegative_int{MAX_TENSOR_DIM})) { + for (nonnegative_int degree = 1_n; degree <= get_num_gpus(resources); + degree *= 2_n) { + substitutions.push_back( + create_replicate_linear_combine(num_dims, degree, true)); + substitutions.push_back( + create_replicate_linear_combine(num_dims, degree, false)); + } + } + substitutions.push_back(create_fuse_linear_activation(Activation::RELU)); + substitutions.push_back(create_fuse_linear_activation(Activation::SIGMOID)); + substitutions.push_back(create_fuse_linear_activation(Activation::TANH)); + substitutions.push_back(create_fuse_linear_activation(Activation::GELU)); + return substitutions; +} + +Substitution create_combine_inception(nonnegative_int num_convs, + nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_combine_concat(nonnegative_int num_inputs, + nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_replicate_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + bool use_bias) { + SubstitutionBuilder b; + + auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = b.add_input(tensor_attribute_pattern_match_all()); + std::vector p_inputs = {p_input, p_weight}; + + std::optional o_bias = std::nullopt; + if (use_bias) { + std::pair bias = + b.add_input(tensor_attribute_pattern_match_all()); + p_inputs.push_back(bias.first); + o_bias = bias.second; + } + + OperatorAttributePattern linear_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals(OperatorAttributeKey::BIAS, + OperatorAttributeValue{use_bias}), + op_attr_key_divisible_by(OperatorAttributeKey::OUT_CHANNELS, + nonnegative_int{degree}), + }}; + + PatternValue p_linear_output = get_only(b.add_pattern_node( + linear_pattern, + p_inputs, 
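+      // one tensor pattern per node output: the single linear output must
+      // have exactly num_dims shard dims; the node is named "linear" so the
+      // output graph below can clone its attrs via
+      // b.pattern_node_named("linear")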
+ {tensor_attr_pattern_require_num_dims(nonnegative_int{num_dims})}, + "linear")); + + OutputOperatorAttrsAssignment replicate_input_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPLICATE), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + }}; + OutputGraphExprValue o_replicate_input_output = + get_only(b.add_output_graph_node(replicate_input_expr, {o_input}, 1_n)); + + OutputOperatorAttrsAssignment partition_weights_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPARTITION), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{1_n}}), + }}; + OutputGraphExprValue o_partition_weights_output = get_only( + b.add_output_graph_node(partition_weights_expr, {o_weight}, 1_n)); + + std::vector o_linear_inputs = { + o_replicate_input_output, o_partition_weights_output}; + + if (use_bias) { + OutputOperatorAttrsAssignment partition_bias_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPARTITION), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{1_n}}), + }}; + OutputGraphExprValue o_partition_bias_output = get_only( + b.add_output_graph_node(partition_bias_expr, {o_bias.value()}, 1_n)); + o_linear_inputs.push_back(o_partition_bias_output); + } + + OutputOperatorAttrsAssignment linear_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("linear"), + {}, + }; + OutputGraphExprValue o_linear_output = + get_only(b.add_output_graph_node(linear_expr, o_linear_inputs, 1_n)); + + OutputOperatorAttrsAssignment combine_expr = OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::COMBINE), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant( + OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{ + nonnegative_int{num_dims.unwrap_nonnegative() - 1}, + }}), + }, + }; + OutputGraphExprValue o_combine_output = + get_only(b.add_output_graph_node(combine_expr, {o_linear_output}, 1_n)); + + b.equate_outputs(p_linear_output, o_combine_output); + + return b.get_substitution(); +} + +Substitution create_partition_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + Activation activation, + bool use_bias) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_conv2d_combine(nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_attention_combine(nonnegative_int num_heads, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_replicate_attention_reduce(nonnegative_int num_heads, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_add_combine(ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_relu_combine(ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_concat_combine(nonnegative_int num_inputs, + ff_dim_t concat_dim, + ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_softmax_combine(ff_dim_t softmax_dim, + ff_dim_t partition_dim, + 
nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_fuse_linear_activation(Activation activation) { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all(), "input"); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all(), "weight"); + + OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + PatternValue p_mm_output = + get_only(b.add_pattern_node(mm_pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + + OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + PatternValue p_relu_output = + get_only(b.add_pattern_node(relu_pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + + OutputOperatorAttrsAssignment fused_node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{activation}), + }}; + OutputGraphExprValue o_fused_node_output = get_only( + b.add_output_graph_node(fused_node_expr, {o_input, o_weight}, 1_n)); + + b.equate_outputs(p_relu_output, o_fused_node_output); + + return b.get_substitution(); +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc b/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc index e8deacebec..dff600ecf0 100644 --- a/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc +++ b/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc @@ -11,7 +11,7 @@ PatternNode get_dst_node(InputPatternEdge const &e) { return PatternNode{e.raw_edge.dst.node}; } -int get_dst_idx(InputPatternEdge const &e) { +nonnegative_int get_dst_idx(InputPatternEdge const &e) { return e.raw_edge.dst.idx; } diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc index 9abdc4e83c..24bbb6f4d1 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc @@ -6,7 +6,7 @@ PatternNode get_src_node(PatternNodeOutput const &o) { return PatternNode{o.raw_dataflow_output.node}; } -int get_idx(PatternNodeOutput const &o) { +nonnegative_int get_idx(PatternNodeOutput const &o) { return o.raw_dataflow_output.idx; } diff --git a/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc b/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc index dea3e5f500..17d05f1122 100644 --- a/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc +++ b/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc @@ -10,11 +10,11 @@ PatternNode get_dst_node(StandardPatternEdge const &e) { return PatternNode{e.raw_edge.dst.node}; } -int get_src_idx(StandardPatternEdge const &e) { +nonnegative_int get_src_idx(StandardPatternEdge const &e) { return e.raw_edge.src.idx; } -int get_dst_idx(StandardPatternEdge const &e) { +nonnegative_int get_dst_idx(StandardPatternEdge const &e) { return e.raw_edge.dst.idx; } diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc 
b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc new file mode 100644 index 0000000000..5fd923f71f --- /dev/null +++ b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc @@ -0,0 +1,174 @@ +#include "substitutions/apply_substitution/apply_substitution.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution_builder.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include "utils/integer_conversions.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("apply_substitution") { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all(), "input"); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all(), "weight"); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + }(); + + PatternValue p_relu_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + }(); + + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only( + b.add_output_graph_node(node_expr, {o_input, o_weight}, 1_n)); + }(); + + b.equate_outputs(p_relu_output, o_fused_output); + + Substitution sub = b.get_substitution(); + + nonnegative_int in_channels = 24_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; + std::string mm_match = "mm_match"; + std::string relu_match = "relu_match"; + + SubParallelComputationGraph pcg = [&] { + ParallelComputationGraphBuilder b; + parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, + }, + ReplicaParallelDimSet{ + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + }, + }, + DataType::FLOAT, + }); + t = b.dense(t, + /*outDim=*/16_n, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12_n, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/mm_match); + t = b.relu(t, + /*name=*/relu_match); + t = b.dense(t, + /*outDim=*/8_n, + /*activation=*/Activation::RELU); + + return sub_pcg_from_full_pcg(b.pcg); + }(); + + PCGPatternMatch match = [&] { + parallel_layer_guid_t mm_match_layer = + get_parallel_layer_by_name(pcg, mm_match); + parallel_layer_guid_t relu_match_layer = + get_parallel_layer_by_name(pcg, relu_match); + open_parallel_tensor_guid_t mm_match_layer_input_activations = + get_layer_inputs(pcg, 
mm_match_layer).at(0); + open_parallel_tensor_guid_t mm_match_layer_input_weights = + get_layer_inputs(pcg, mm_match_layer).at(1); + + return PCGPatternMatch{ + bidict{ + {b.pattern_node_named("mm"), mm_match_layer}, + {b.pattern_node_named("relu"), relu_match_layer}, + }, + std::unordered_map{ + { + b.pattern_input_named("input"), + mm_match_layer_input_activations, + }, + { + b.pattern_input_named("weight"), + mm_match_layer_input_weights, + }}, + }; + }(); + + SubParallelComputationGraph result = apply_substitution(pcg, sub, match); + + SubParallelComputationGraph correct = [&] { + ParallelComputationGraphBuilder b; + parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, + }, + ReplicaParallelDimSet{ + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + }, + }, + DataType::FLOAT, + }); + t = b.dense(t, + /*outDim=*/16_n, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12_n, + /*activation=*/Activation::RELU, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/std::nullopt); + t = b.dense(t, + /*outDim=*/8_n, + /*activation=*/Activation::RELU); + + return sub_pcg_from_full_pcg(b.pcg); + }(); + + // since the new nodes produced by the substitution have new ids, it's + // easier/more correct to check that the graphs are isomorphic rather than + // checking their exact graph data + CHECK(sub_pcgs_are_isomorphic(result, correct)); + } +} diff --git a/lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc similarity index 86% rename from lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc rename to lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc index 52b54b32fb..7bdcc5a3bd 100644 --- a/lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "substitutions/open_parallel_tensor_guid_t.h" #include "substitutions/operator_pattern/operator_attribute_constraint.h" @@ -64,20 +64,23 @@ TEST_SUITE(FF_TEST_SUITE) { OutputGraphExprInput{output_g.add_input({})}; OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = - OutputOperatorAttrsAssignment{{ - set_attr_to_constant(OperatorAttributeKey::OP_TYPE, - OperatorAttributeValue{OperatorType::LINEAR}), - copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, - pattern_mm_node), - set_attr_to_constant(OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{Activation::RELU}), - copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, - pattern_mm_node), - }}; + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_attr_to_constant( + OperatorAttributeKey::OP_TYPE, + OperatorAttributeValue{OperatorType::LINEAR}), + 
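+              // a hand-built attrs assignment (std::nullopt above leaves
+              // template_operator unset): OP_TYPE and ACTIVATION are pinned
+              // as constants, while the remaining fields are copied from the
+              // matched pattern node below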
copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, + pattern_mm_node), + copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, + pattern_mm_node), + copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, + pattern_mm_node), + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, + pattern_mm_node), + }}; NodeAddedResult fused_mm_relu_added = output_g.add_node( fused_mm_relu_attrs_assignment, {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, @@ -108,9 +111,9 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - int in_channels = 24; - int batch_size = 4; - int batch_degree = 2; + nonnegative_int in_channels = 24_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; std::string mm_match = "mm_match"; std::string relu_match = "relu_match"; @@ -119,22 +122,22 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }); t = b.dense(t, - /*outDim=*/16, + /*outDim=*/16_n, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12, + /*outDim=*/12_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -144,7 +147,7 @@ TEST_SUITE(FF_TEST_SUITE) { t = b.relu(t, /*name=*/relu_match); t = b.dense(t, - /*outDim=*/8, + /*outDim=*/8_n, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); @@ -186,10 +189,10 @@ TEST_SUITE(FF_TEST_SUITE) { result_input_map = result.second.input_mapping; LinearAttrs correct_result_fused_mm_relu_attrs = LinearAttrs{ - 12, + /*out_channels=*/12_n, /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, + /*data_type=*/DataType::FLOAT, + /*activation=*/Activation::RELU, /*regularizer=*/std::nullopt, }; @@ -228,7 +231,7 @@ TEST_SUITE(FF_TEST_SUITE) { result_i_activation.raw_dataflow_graph_input, DataflowInput{ result_fused_mm_relu_node.raw_graph_node, - 0, + 0_n, }, }, }, @@ -239,7 +242,7 @@ TEST_SUITE(FF_TEST_SUITE) { result_i_weights.raw_dataflow_graph_input, DataflowInput{ result_fused_mm_relu_node.raw_graph_node, - 1, + 1_n, }, }, }, diff --git a/lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc similarity index 78% rename from lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc rename to lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc index 4d4e557fb8..950e833771 100644 --- a/lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/perform_shape_inference.h" +#include "substitutions/apply_substitution/perform_shape_inference.h" #include "op-attrs/ops/element_unary.h" #include "op-attrs/ops/linear.h" #include "op-attrs/parallel_tensor_shape.h" @@ -18,21 +18,21 @@ TEST_SUITE(FF_TEST_SUITE) { UnorderedSetLabelledOpenDataflowGraph>(); - int in_channels = 24; - int 
out_channels = 16; - int batch_size = 4; - int batch_degree = 2; + nonnegative_int in_channels = 24_n; + nonnegative_int out_channels = 16_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; DataflowGraphInput i0 = g.add_input({}); ParallelTensorShape i0_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -40,28 +40,28 @@ TEST_SUITE(FF_TEST_SUITE) { bool use_bias = false; LinearAttrs n1_op_attrs = LinearAttrs{ - out_channels, - use_bias, - DataType::FLOAT, - std::nullopt, - std::nullopt, + /*out_channels=*/out_channels, + /*use_bias=*/use_bias, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, }; ParallelLayerAttrs n1_attrs = ParallelLayerAttrs{ - PCGOperatorAttrs{ + /*op_attrs=*/PCGOperatorAttrs{ n1_op_attrs, }, - std::nullopt, + /*name=*/std::nullopt, }; ElementUnaryAttrs n2_op_attrs = ElementUnaryAttrs{ - OperatorType::RELU, - std::nullopt, + /*op_type=*/OperatorType::RELU, + /*scalar=*/std::nullopt, }; ParallelLayerAttrs n2_attrs = ParallelLayerAttrs{ - PCGOperatorAttrs{ + /*op_attrs=*/PCGOperatorAttrs{ n2_op_attrs, }, - std::nullopt, + /*name=*/std::nullopt, }; ParallelTensorShape n1_output_shape = @@ -131,22 +131,22 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowEdge{ DataflowInputEdge{ i0, - DataflowInput{n1, 0}, + DataflowInput{n1, 0_n}, }, }, OpenDataflowEdge{DataflowEdge{ - DataflowOutput{n1_weight_node, 0}, - DataflowInput{n1_weight_replicate_node, 0}, + DataflowOutput{n1_weight_node, 0_n}, + DataflowInput{n1_weight_replicate_node, 0_n}, }}, OpenDataflowEdge{ DataflowEdge{ - DataflowOutput{n1_weight_replicate_node, 0}, - DataflowInput{n1, 1}, + DataflowOutput{n1_weight_replicate_node, 0_n}, + DataflowInput{n1, 1_n}, }, }, OpenDataflowEdge{DataflowEdge{ - DataflowOutput{n1, 0}, - DataflowInput{n2, 0}, + DataflowOutput{n1, 0_n}, + DataflowInput{n2, 0_n}, }}, }, {i0}, @@ -155,19 +155,20 @@ TEST_SUITE(FF_TEST_SUITE) { i0_shape, }, { - OpenDataflowValue{DataflowOutput{n1_weight_node, 0}}, + OpenDataflowValue{DataflowOutput{n1_weight_node, 0_n}}, lift_to_parallel(get_reduced_shape(n1_weight_shape)), }, { - OpenDataflowValue{DataflowOutput{n1_weight_replicate_node, 0}}, + OpenDataflowValue{ + DataflowOutput{n1_weight_replicate_node, 0_n}}, n1_weight_shape, }, { - OpenDataflowValue{DataflowOutput{n1, 0}}, + OpenDataflowValue{DataflowOutput{n1, 0_n}}, n1_output_shape, }, { - OpenDataflowValue{DataflowOutput{n2, 0}}, + OpenDataflowValue{DataflowOutput{n2, 0_n}}, n2_output_shape, }}}; diff --git a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc index 95b61e0ef4..24f9e9bd56 100644 --- a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc +++ b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attribute(LinearAttrs, OperatorAttributeKey)") { - int out_channels = 16; + nonnegative_int out_channels = 16_n; bool use_bias = true; std::optional activation = Activation::GELU; std::optional regularizer = RegularizerAttrs{ diff --git 
a/lib/substitutions/test/src/substitutions/pcg_pattern.cc b/lib/substitutions/test/src/substitutions/pcg_pattern.cc index d9273b4bcf..9ff368a8eb 100644 --- a/lib/substitutions/test/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/test/src/substitutions/pcg_pattern.cc @@ -15,19 +15,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_pattern_matches(PCGPattern, SubParallelComputationGraph)") { ParallelComputationGraphBuilder builder; - size_t batch_size = 16; - int batch_degree = 2; - size_t num_channels = 24; + nonnegative_int batch_size = 16_n; + nonnegative_int batch_degree = 2_n; + nonnegative_int num_channels = 24_n; ParallelTensorShape a_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ ShardParallelDim{batch_size, batch_degree}, - ShardParallelDim{num_channels, 1}, + ShardParallelDim{num_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -37,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape, CreateGrad::YES, a_name); - int outDim = 16; + nonnegative_int outDim = 16_n; std::string x_matmul_name = "x_matmul"; std::string y_matmul_name = "y_matmul"; parallel_tensor_guid_t t0 = diff --git a/lib/substitutions/test/src/substitutions/substitution.cc b/lib/substitutions/test/src/substitutions/substitution.cc index 1718b03b5c..ef27cb7606 100644 --- a/lib/substitutions/test/src/substitutions/substitution.cc +++ b/lib/substitutions/test/src/substitutions/substitution.cc @@ -4,226 +4,173 @@ #include "substitutions/operator_pattern/operator_attribute_constraint.h" #include "substitutions/output_graph/output_graph_expr_node.dtg.h" #include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_builder.h" #include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution_builder.h" #include "substitutions/tensor_pattern/tensor_attribute_pattern.h" #include "utils/containers/get_only.h" #include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" #include "utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h" +#include "utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h" #include "utils/integer_conversions.h" #include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - // TEST_CASE("is_valid_substitution") { - // FAIL("TODO"); - // } - - TEST_CASE("evaluate_substitution_output(SubParallelComputationGraph, " - "Substitution, PCGPatternMatch)") { - // Currently Substitution creation is very verbose. - // This is being addressed in - // https://github.com/flexflow/FlexFlow/issues/1473. 
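The hunk below swaps the hand-rolled graph construction (the removed lines that follow) for the SubstitutionBuilder flow that the rest of this patch standardizes on. As a reading aid, here is a condensed sketch of that flow, distilled from the calls visible in these tests; the ACTIVATION constraint and input names are omitted for brevity, so treat this as an illustration of the builder pattern rather than an authoritative API reference:

    SubstitutionBuilder b;

    // Pattern side: two inputs feed a LINEAR node whose output feeds RELU.
    auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all());
    auto [p_weight, o_weight] = b.add_input(tensor_attribute_pattern_match_all());

    PatternValue p_mm = get_only(b.add_pattern_node(
        OperatorAttributePattern{{
            op_type_equals_constraint(OperatorType::LINEAR),
        }},
        {p_input, p_weight},
        {tensor_attribute_pattern_match_all()},
        "mm"));
    PatternValue p_relu = get_only(b.add_pattern_node(
        OperatorAttributePattern{{
            op_type_equals_constraint(OperatorType::RELU),
        }},
        {p_mm},
        {tensor_attribute_pattern_match_all()},
        "relu"));

    // Output side: one fused node inheriting its attributes from the pattern
    // node named "mm", with ACTIVATION overridden to RELU.
    OutputGraphExprValue o_fused = get_only(b.add_output_graph_node(
        OutputOperatorAttrsAssignment{
            b.pattern_node_named("mm"),
            {set_attr_to_constant(OperatorAttributeKey::ACTIVATION,
                                  OperatorAttributeValue{Activation::RELU})},
        },
        {o_input, o_weight},
        1_n));

    // Tie the pattern's final output to the output graph's output.
    b.equate_outputs(p_relu, o_fused);
    Substitution sub = b.get_substitution();

The builder hides the LabelledOpenDataflowGraph plumbing and the two input/output bidicts that the removed code wires up by hand, which is the verbosity the comment above refers to.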
- auto pattern_g = LabelledOpenDataflowGraph:: - create>(); - - PatternInput pattern_i_activation = - PatternInput{pattern_g.add_input(tensor_attribute_pattern_match_all())}; - PatternInput pattern_i_weights = - PatternInput{pattern_g.add_input(tensor_attribute_pattern_match_all())}; - - OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ - op_type_equals_constraint(OperatorType::LINEAR), - op_attr_key_equals( - OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{std::optional{std::nullopt}}), - }}; - NodeAddedResult mm_added = pattern_g.add_node( - mm_pattern, - {OpenDataflowValue{pattern_i_activation.raw_dataflow_graph_input}, - OpenDataflowValue{pattern_i_weights.raw_dataflow_graph_input}}, - {tensor_attribute_pattern_match_all()}); - PatternNode pattern_mm_node = PatternNode{mm_added.node}; - DataflowOutput mm_output = get_only(mm_added.outputs); - - OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ - op_type_equals_constraint(OperatorType::RELU), - }}; - NodeAddedResult relu_added = - pattern_g.add_node(relu_pattern, - {OpenDataflowValue{mm_output}}, - {tensor_attribute_pattern_match_all()}); - PatternNode pattern_relu_node = PatternNode{relu_added.node}; - DataflowOutput relu_output = get_only(relu_added.outputs); - - LabelledOpenDataflowGraph - output_g = LabelledOpenDataflowGraph:: - create>(); - - OutputGraphExprInput output_i_activation = - OutputGraphExprInput{output_g.add_input({})}; - OutputGraphExprInput output_i_weights = - OutputGraphExprInput{output_g.add_input({})}; - - OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = - OutputOperatorAttrsAssignment{{ - set_attr_to_constant(OperatorAttributeKey::OP_TYPE, - OperatorAttributeValue{OperatorType::LINEAR}), - copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, - pattern_mm_node), - set_attr_to_constant(OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{Activation::RELU}), - copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, - pattern_mm_node), + TEST_CASE("is_isomorphic_to(Substitution, Substitution)") { + auto make_substitution = [] { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{ + std::optional{std::nullopt}}), }}; - NodeAddedResult fused_mm_relu_added = output_g.add_node( - fused_mm_relu_attrs_assignment, - {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, - OpenDataflowValue{output_i_weights.raw_dataflow_graph_input}}, - {{}}); - OutputGraphExprNode fused_mm_relu_node = - OutputGraphExprNode{fused_mm_relu_added.node}; - DataflowOutput fused_mm_relu_output = get_only(fused_mm_relu_added.outputs); - - Substitution sub = Substitution{ - PCGPattern{pattern_g}, - OutputGraphExpr{output_g}, - bidict{ - { - pattern_i_activation, - output_i_activation, - }, - { - pattern_i_weights, - output_i_weights, - }, - }, - bidict{ + + return get_only( + b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + }(); + + PatternValue p_relu_output = [&] { + auto pattern = 
OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only( + b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + }(); + + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), { - PatternNodeOutput{relu_output}, - OutputGraphExprNodeOutput{fused_mm_relu_output}, - }, - }, + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only(b.add_output_graph_node( + node_expr, {o_input, o_weight}, nonnegative_int{1})); + }(); + + b.equate_outputs(p_relu_output, o_fused_output); + + return b.get_substitution(); }; - int in_channels = 24; - int batch_size = 4; - int batch_degree = 2; - std::string mm_match = "mm_match"; - std::string relu_match = "relu_match"; - - SubParallelComputationGraph pcg = [&] { - ParallelComputationGraphBuilder b; - parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ - ParallelTensorDims{ - FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, - }, - ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, - }, - }, - DataType::FLOAT, - }); - t = b.dense(t, - /*outDim=*/16, - /*activation=*/std::nullopt); - t = b.gelu(t); - t = b.dense(t, - /*outDim=*/12, - /*activation=*/std::nullopt, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - /*name=*/mm_match); - t = b.relu(t, - /*name=*/relu_match); - t = b.dense(t, - /*outDim=*/8, - /*activation=*/Activation::RELU); - - return sub_pcg_from_full_pcg(b.pcg); + Substitution sub1 = make_substitution(); + Substitution sub2 = make_substitution(); + + CHECK(is_isomorphic_to(sub1, sub1)); + CHECK(is_isomorphic_to(sub1, sub2)); + } + + TEST_CASE("is_valid_substitution") { + SubstitutionBuilder b; + + auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); }(); - PCGPatternMatch match = [&] { - parallel_layer_guid_t mm_match_layer = - get_parallel_layer_by_name(pcg, mm_match); - parallel_layer_guid_t relu_match_layer = - get_parallel_layer_by_name(pcg, relu_match); - open_parallel_tensor_guid_t mm_match_layer_input_activations = - get_layer_inputs(pcg, mm_match_layer).at(0); - open_parallel_tensor_guid_t mm_match_layer_input_weights = - get_layer_inputs(pcg, mm_match_layer).at(1); - - return PCGPatternMatch{ - bidict{ - {pattern_mm_node, mm_match_layer}, - {pattern_relu_node, relu_match_layer}, - }, - std::unordered_map{ - { - PatternInput{pattern_i_activation}, - mm_match_layer_input_activations, - }, - { - PatternInput{pattern_i_weights}, - mm_match_layer_input_weights, - }}, - }; + PatternValue p_relu_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); }(); - SubParallelComputationGraph 
result = apply_substitution(pcg, sub, match); - - SubParallelComputationGraph correct = [&] { - ParallelComputationGraphBuilder b; - parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ - ParallelTensorDims{ - FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, - }, - ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, - }, - }, - DataType::FLOAT, - }); - t = b.dense(t, - /*outDim=*/16, - /*activation=*/std::nullopt); - t = b.gelu(t); - t = b.dense(t, - /*outDim=*/12, - /*activation=*/Activation::RELU, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - /*name=*/std::nullopt); - t = b.dense(t, - /*outDim=*/8, - /*activation=*/Activation::RELU); - - return sub_pcg_from_full_pcg(b.pcg); + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only(b.add_output_graph_node( + node_expr, {o_input, o_weight}, nonnegative_int{1})); }(); - // since the new nodes produced by the substitution have new ids, it's - // easier/more correct to check that the graphs are isomorphic rather than - // checking their exact graph data - CHECK(sub_pcgs_are_isomorphic(result, correct)); + b.equate_outputs(p_relu_output, o_fused_output); + + SUBCASE("pattern inputs != mapped inputs") { + Substitution sub = b.get_substitution(); + sub.pcg_pattern.raw_graph.add_input(tensor_attribute_pattern_match_all()); + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("output graph inputs != mapped inputs") { + Substitution sub = b.get_substitution(); + sub.output_graph_expr.raw_graph.add_input(std::monostate{}); + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("pattern has no nodes") { + // Could revamp this test to only trigger the + // get_nodes(sub.pcg_pattern).empty() case + Substitution sub = b.get_substitution(); + LabelledOpenDataflowGraph + zero_node_pattern = + LabelledOpenDataflowGraph:: + create>(); + sub.pcg_pattern = PCGPattern{zero_node_pattern}; + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("output graph has no nodes") { + // Could revamp this test to only trigger the + // get_nodes(sub.output_graph_expr).empty() case + Substitution sub = b.get_substitution(); + LabelledOpenDataflowGraph + zero_node_pattern = + LabelledOpenDataflowGraph:: + create>(); + sub.output_graph_expr = OutputGraphExpr{zero_node_pattern}; + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("valid substitution") { + Substitution sub = b.get_substitution(); + CHECK(is_valid_substitution(sub)); + } } } diff --git a/lib/substitutions/test/src/substitutions/substitution_builder.cc b/lib/substitutions/test/src/substitutions/substitution_builder.cc new file mode 100644 index 0000000000..028a4e59c9 --- /dev/null +++ b/lib/substitutions/test/src/substitutions/substitution_builder.cc @@ -0,0 +1,145 @@ +#include "substitutions/substitution_builder.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_graph_expr_node.dtg.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/substitution.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include 
"utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("SubstitutionBuilder") { + OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + std::unordered_map + fused_mm_relu_attr_assignments = { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }; + + Substitution correct = [&] { + auto pattern_g = LabelledOpenDataflowGraph:: + create< + UnorderedSetLabelledOpenDataflowGraph>(); + + PatternInput pattern_i_activation = PatternInput{ + pattern_g.add_input(tensor_attribute_pattern_match_all())}; + PatternInput pattern_i_weights = PatternInput{ + pattern_g.add_input(tensor_attribute_pattern_match_all())}; + + NodeAddedResult mm_added = pattern_g.add_node( + mm_pattern, + {OpenDataflowValue{pattern_i_activation.raw_dataflow_graph_input}, + OpenDataflowValue{pattern_i_weights.raw_dataflow_graph_input}}, + {tensor_attribute_pattern_match_all()}); + PatternNode pattern_mm_node = PatternNode{mm_added.node}; + DataflowOutput mm_output = get_only(mm_added.outputs); + + NodeAddedResult relu_added = + pattern_g.add_node(relu_pattern, + {OpenDataflowValue{mm_output}}, + {tensor_attribute_pattern_match_all()}); + PatternNode pattern_relu_node = PatternNode{relu_added.node}; + DataflowOutput relu_output = get_only(relu_added.outputs); + + LabelledOpenDataflowGraph + output_g = LabelledOpenDataflowGraph:: + create>(); + + OutputGraphExprInput output_i_activation = + OutputGraphExprInput{output_g.add_input({})}; + OutputGraphExprInput output_i_weights = + OutputGraphExprInput{output_g.add_input({})}; + + OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = + OutputOperatorAttrsAssignment{ + pattern_mm_node, + fused_mm_relu_attr_assignments, + }; + NodeAddedResult fused_mm_relu_added = output_g.add_node( + fused_mm_relu_attrs_assignment, + {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, + OpenDataflowValue{output_i_weights.raw_dataflow_graph_input}}, + {{}}); + OutputGraphExprNode fused_mm_relu_node = + OutputGraphExprNode{fused_mm_relu_added.node}; + DataflowOutput fused_mm_relu_output = + get_only(fused_mm_relu_added.outputs); + + return Substitution{ + PCGPattern{pattern_g}, + OutputGraphExpr{output_g}, + bidict{ + { + pattern_i_activation, + output_i_activation, + }, + { + pattern_i_weights, + output_i_weights, + }, + }, + bidict{ + { + PatternNodeOutput{relu_output}, + OutputGraphExprNodeOutput{fused_mm_relu_output}, + }, + }, + }; + }(); + + Substitution result = [&] { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = + get_only(b.add_pattern_node(mm_pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + + PatternValue p_relu_output = + get_only(b.add_pattern_node(relu_pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + + OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = + OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + fused_mm_relu_attr_assignments, + }; + 
OutputGraphExprValue o_fused_output = + get_only(b.add_output_graph_node(fused_mm_relu_attrs_assignment, + {o_input, o_weight}, + nonnegative_int{1})); + + b.equate_outputs(p_relu_output, o_fused_output); + + return b.get_substitution(); + }(); + + CHECK(is_isomorphic_to(result, correct)); + } +} diff --git a/lib/substitutions/test/src/substitutions/unity_substitution_set.cc b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc new file mode 100644 index 0000000000..804fa99bef --- /dev/null +++ b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc @@ -0,0 +1,20 @@ +#include "substitutions/unity_substitution_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_substitution_set") { + MachineSpecification machine_spec = MachineSpecification{ + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/8_n, + /*num_gpus_per_node=*/4_n, + /*inter_node_bandwidth=*/0.0, + /*intra_node_bandwidth=*/0.0, + }; + + std::vector result = get_substitution_set(machine_spec); + + CHECK(result.size() == 36); + } +} diff --git a/lib/substitutions/test/src/test_pattern_matches.cc b/lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc similarity index 94% rename from lib/substitutions/test/src/test_pattern_matches.cc rename to lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc index aeedd65f82..ab79ad6ff6 100644 --- a/lib/substitutions/test/src/test_pattern_matches.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc @@ -9,7 +9,6 @@ #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_inputs.h" #include "utils/graph/open_dataflow_graph/open_dataflow_graph.h" -#include "utils/overload.h" #include using namespace FlexFlow; @@ -59,30 +58,30 @@ namespace rc { // OpenMultiDiGraphView subgraph = // get_subgraph(as_openmultidigraph(g), // subgraph_nodes); - +// // std::vector matches = // find_pattern_matches(subgraph, as_openmultidigraph(g), AlwaysTrue{}); - +// // RC_ASSERT(!matches.empty()); - +// // for (MultiDiGraphPatternMatch const &match : matches) { // RC_ASSERT(pattern_matches(subgraph, as_openmultidigraph(g), match, // AlwaysTrue{})); // } // }); -// } TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("find_pattern_matches_small") { + TEST_CASE("find_pattern_matches") { OpenDataflowGraph pattern_graph = OpenDataflowGraph::create(); - NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1); + NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1_n); Node pattern_n0 = pattern_n0_added.node; OpenDataflowValue pattern_v0 = OpenDataflowValue{get_only(pattern_n0_added.outputs)}; - NodeAddedResult pattern_n1_added = pattern_graph.add_node({pattern_v0}, 1); + NodeAddedResult pattern_n1_added = + pattern_graph.add_node({pattern_v0}, 1_n); Node pattern_n1 = pattern_n1_added.node; OpenDataflowValue pattern_v1 = OpenDataflowValue{get_only(pattern_n1_added.outputs)}; @@ -94,19 +93,19 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph graph = OpenDataflowGraph::create(); - NodeAddedResult n0_added = graph.add_node({}, 1); + NodeAddedResult n0_added = graph.add_node({}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = graph.add_node({v0}, 1); + NodeAddedResult n1_added = graph.add_node({v0}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; - NodeAddedResult n2_added = 
graph.add_node({v1}, 1); + NodeAddedResult n2_added = graph.add_node({v1}, 1_n); Node n2 = n2_added.node; OpenDataflowValue v2 = OpenDataflowValue{get_only(n2_added.outputs)}; - NodeAddedResult n3_added = graph.add_node({v2}, 1); + NodeAddedResult n3_added = graph.add_node({v2}, 1_n); Node n3 = n3_added.node; OpenDataflowValue v3 = OpenDataflowValue{get_only(n3_added.outputs)}; @@ -128,8 +127,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector n1_incoming = {OpenDataflowEdge{ DataflowEdge{ - DataflowOutput{n0, 0}, - DataflowInput{n1, 0}, + DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}, }, }}; @@ -201,7 +200,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph::create(); DataflowGraphInput i0 = g.add_input(); - NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node g_n0 = g_n0_added.node; OpenDataflowValue g_v0 = OpenDataflowValue{get_only(g_n0_added.outputs)}; PatternNode g_p0 = PatternNode{g_n0}; diff --git a/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc new file mode 100644 index 0000000000..8fd468d186 --- /dev/null +++ b/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc @@ -0,0 +1,210 @@ +#include "substitutions/unlabelled/pattern_matching.h" +#include "substitutions/unlabelled/find_pattern_matches.h" +#include "substitutions/unlabelled/match_additional_criterion.h" +#include "utils/containers/get_only.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_inputs.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_graph.h" +#include "utils/overload.h" +#include + +using namespace FlexFlow; + +namespace rc { + +// template <> +// struct Arbitrary { +// static int const MAX_GRAPH_SIZE = 200; +// static int const MAX_EDGE_SIZE = 1000; +// +// static Gen arbitrary() { +// return gen::exec([&] { +// int num_nodes = *gen::inRange(1, MAX_GRAPH_SIZE + 1); +// MultiDiGraph g = MultiDiGraph::template +// create(); +// +// std::vector nodes; +// for (int i = 0; i < num_nodes; ++i) { +// nodes.push_back(g.add_node()); +// } +// +// int num_edges = *gen::inRange(1, MAX_GRAPH_SIZE + 1); +// for (int i = 0; i < num_edges; ++i) { +// int src_id = *gen::inRange(0, num_nodes); +// int dst_id = *gen::inRange(0, num_nodes); +// if (src_id > dst_id) { +// std::swap(src_id, dst_id); +// } +// +// g.add_edge(MultiDiEdge{nodes[dst_id], +// g.add_node_port(), +// nodes[src_id], +// g.add_node_port()}); +// } +// +// return g; +// }); +// } +// }; + +} // namespace rc + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("find_pattern_matches") { + OpenDataflowGraph pattern_graph = + OpenDataflowGraph::create(); + + NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1_n); + Node pattern_n0 = pattern_n0_added.node; + OpenDataflowValue pattern_v0 = + OpenDataflowValue{get_only(pattern_n0_added.outputs)}; + + NodeAddedResult pattern_n1_added = + pattern_graph.add_node({pattern_v0}, 1_n); + Node pattern_n1 = pattern_n1_added.node; + OpenDataflowValue pattern_v1 = + OpenDataflowValue{get_only(pattern_n1_added.outputs)}; + + UnlabelledGraphPattern pattern = 
UnlabelledGraphPattern{pattern_graph}; + PatternNode p0 = PatternNode{pattern_n0}; + PatternNode p1 = PatternNode{pattern_n1}; + + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + NodeAddedResult n0_added = graph.add_node({}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = graph.add_node({v0}, 1_n); + Node n1 = n1_added.node; + OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; + + NodeAddedResult n2_added = graph.add_node({v1}, 1_n); + Node n2 = n2_added.node; + OpenDataflowValue v2 = OpenDataflowValue{get_only(n2_added.outputs)}; + + NodeAddedResult n3_added = graph.add_node({v2}, 1_n); + Node n3 = n3_added.node; + OpenDataflowValue v3 = OpenDataflowValue{get_only(n3_added.outputs)}; + + UnlabelledDataflowGraphPatternMatch match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {p0, n0}, + {p1, n1}, + }, + bidict{}}; + + UnlabelledDataflowGraphPatternMatch invalid_match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {p0, n1}, + {p1, n2}, + }, + bidict{}}; + + std::vector n1_incoming = {OpenDataflowEdge{ + DataflowEdge{ + DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}, + }, + }}; + + SUBCASE("get_incoming_edges") { + SUBCASE("n0") { + std::vector result = get_incoming_edges(graph, n0); + std::vector correct = {}; + CHECK(result == correct); + } + SUBCASE("n1") { + std::vector result = get_incoming_edges(graph, n1); + std::vector correct = n1_incoming; + CHECK(result == correct); + } + SUBCASE("both") { + std::unordered_map> result = + get_incoming_edges(graph, {n0, n1}); + std::unordered_map> correct = { + {n0, {}}, {n1, n1_incoming}}; + CHECK(result == correct); + } + } + + SUBCASE("get_subgraph_inputs") { + std::unordered_set result = + get_subgraph_inputs(graph, {n0, n1}); + std::unordered_set correct = {}; + CHECK(result == correct); + } + + SUBCASE("get_subgraph") { + OpenDataflowGraphView g = get_subgraph(graph, {n0, n1}).graph; + SUBCASE("nodes") { + std::unordered_set result = get_nodes(g); + std::unordered_set correct = {n0, n1}; + CHECK(result == correct); + } + SUBCASE("inputs") { + std::unordered_set result = g.get_inputs(); + std::unordered_set correct = {}; + CHECK(result == correct); + } + SUBCASE("get_open_dataflow_values") { + std::unordered_set values = + get_open_dataflow_values(g); + CHECK(values.size() == 2); + } + } + + SUBCASE("subgraph_matched") { + OpenDataflowGraphView result = subgraph_matched(graph, match).graph; + std::unordered_set result_nodes = get_nodes(result); + std::unordered_set correct_nodes = {n0, n1}; + CHECK(result_nodes == correct_nodes); + } + + SUBCASE("unlabelled_pattern_does_match") { + CHECK(unlabelled_pattern_does_match( + pattern, graph, match, match_additional_crition_always_true())); + CHECK_FALSE(unlabelled_pattern_does_match( + pattern, + graph, + invalid_match, + match_additional_crition_always_true())); + } + + SUBCASE("unlabelled_pattern_does_match") { + OpenDataflowGraph g = + OpenDataflowGraph::create(); + DataflowGraphInput i0 = g.add_input(); + + NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); + Node g_n0 = g_n0_added.node; + OpenDataflowValue g_v0 = OpenDataflowValue{get_only(g_n0_added.outputs)}; + PatternNode g_p0 = PatternNode{g_n0}; + PatternInput g_pi0 = PatternInput{i0}; + + UnlabelledGraphPattern open_pattern = UnlabelledGraphPattern{g}; + + UnlabelledDataflowGraphPatternMatch open_match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {g_p0, n1}, + }, + bidict{ + 
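// Unlike the closed-pattern matches above, an open pattern must also bind its
// PatternInput to a concrete value in the target graph: here g_pi0 is matched
// against v0, the output produced by n0.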
{g_pi0, v0}, + }}; + + CHECK(unlabelled_pattern_does_match( + open_pattern, + graph, + open_match, + match_additional_crition_always_true())); + } + } +} diff --git a/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc b/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc index e4d763d9c3..1bddb9f680 100644 --- a/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc @@ -13,11 +13,11 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph g = OpenDataflowGraph::create(); - NodeAddedResult n0_added = g.add_node({}, 1); + NodeAddedResult n0_added = g.add_node({}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = g.add_node({v0}, 1); + NodeAddedResult n1_added = g.add_node({v0}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; @@ -77,11 +77,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); DataflowGraphInput i1 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = g.add_node({OpenDataflowValue{i1}}, 1); + NodeAddedResult n1_added = g.add_node({OpenDataflowValue{i1}}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; diff --git a/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc b/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc index e0805dbfd4..22d1b8a2a5 100644 --- a/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK_FALSE(is_singleton_pattern(pattern)); } - NodeAddedResult n0_added = g.add_node({}, 1); + NodeAddedResult n0_added = g.add_node({}, 1_n); OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; SUBCASE("1 node") { @@ -26,7 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(is_singleton_pattern(pattern)); } - NodeAddedResult n1_added = g.add_node({v0}, 1); + NodeAddedResult n1_added = g.add_node({v0}, 1_n); OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; SUBCASE("more than 1 node") { diff --git a/lib/substitutions/test/src/test_substitution.cc b/lib/substitutions/test/src/test_substitution.cc deleted file mode 100644 index dcb06a78fa..0000000000 --- a/lib/substitutions/test/src/test_substitution.cc +++ /dev/null @@ -1,148 +0,0 @@ -#include "doctest/doctest.h" -#include "op-attrs/get_op_type.h" -#include "rapidcheck.h" -#include "substitutions/substitution.h" - -using namespace FlexFlow; - -// TEST_SUITE(FF_TEST_SUITE) { -// TEST_CASE("substitution") { -// PCGPattern pattern; -// OutputGraphExpr output_expr; -// bidict{ -// OperatorAttributeConstraint{ConstraintType::EQUAL, -// OperatorAttributeKey::OP_TYPE, -// OperatorType::LINEAR}}}; -// -// ParallelTensorPattern tensor_pattern_e0{ -// std::vector{ -// TensorAttributeConstraint{ConstraintType::EQUAL, -// ListIndexAccess{ -// TensorAttributeKey::DIM_SIZES, 0}, -// 2}}}; -// -// ParallelTensorPattern tensor_pattern_empty{ -// std::vector{}}; -// -// auto ig = -// OutputLabelledOpenMultiDiGraph:: -// create>(); -// Node n0 = 
ig.add_node(operator_pattern_n0); -// NodePort p0 = ig.add_node_port(); -// InputMultiDiEdge e0{n0, p0, std::make_pair(p0.value(), p0.value())}; -// ig.add_edge(e0); -// ig.add_label(e0, tensor_pattern_e0); -// -// RC_ASSERT(get_nodes(ig).size() == 1); -// RC_ASSERT(get_edges(ig).size() == 1); -// -// GraphPattern input_graph{ig}; -// -// OperatorAttrAssignment op_ass_n1{ -// {{OperatorAttributeKey::OP_TYPE, -// AttrConstant{OperatorType::REPARTITION}}, -// {OperatorAttributeKey::PARALLEL_DIM, -// AttrConstant{ff_dim_t{nonnegative_int{0}}}}, -// {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}}; -// -// OperatorAttrAssignment op_ass_n2{ -// {{OperatorAttributeKey::OP_TYPE, AttrConstant{OperatorType::LINEAR}}, -// {OperatorAttributeKey::OUT_CHANNELS, -// OperatorAttrAccess{n0, OperatorAttributeKey::OUT_CHANNELS}}, -// {OperatorAttributeKey::USE_BIAS, -// OperatorAttrAccess{n0, OperatorAttributeKey::USE_BIAS}}, -// {OperatorAttributeKey::DATA_TYPE, -// OperatorAttrAccess{n0, OperatorAttributeKey::DATA_TYPE}}, -// {OperatorAttributeKey::ACTIVATION, -// OperatorAttrAccess{n0, OperatorAttributeKey::ACTIVATION}}, -// {OperatorAttributeKey::REGULARIZER, -// OperatorAttrAccess{n0, OperatorAttributeKey::REGULARIZER}}}}; -// -// OperatorAttrAssignment op_ass_n3{ -// {{OperatorAttributeKey::OP_TYPE, -// AttrConstant{OperatorType::REDUCTION}}, -// {OperatorAttributeKey::PARALLEL_DIM, -// AttrConstant{ff_dim_t{nonnegative_int{0}}}}, -// {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}}; -// -// auto og = NodeLabelledOpenMultiDiGraph::create< -// UnorderedNodeLabelledOpenMultiDiGraph>(); -// Node n1 = og.add_node(op_ass_n1); -// Node n2 = og.add_node(op_ass_n2); -// Node n3 = og.add_node(op_ass_n3); -// NodePort p1 = og.add_node_port(); -// NodePort p2 = og.add_node_port(); -// NodePort p3 = og.add_node_port(); -// InputMultiDiEdge e1{n1, p1, {p1.value(), p1.value()}}; -// MultiDiEdge e2{n2, p2, n1, p1}; -// MultiDiEdge e3{n3, p3, n2, p2}; -// og.add_edge(e1); -// og.add_edge(e2); -// og.add_edge(e3); -// OutputGraphExpr output_graph_expr{og}; -// -// RC_ASSERT(get_nodes(og).size() == 3); -// RC_ASSERT(get_edges(og).size() == 3); -// -// bidict input_mapping; -// input_mapping.equate(e0, e1); -// bidict output_mapping; -// -// Substitution substitution{ -// input_graph, output_graph_expr, input_mapping, output_mapping}; -// -// SubParallelComputationGraph pcg = -// OutputLabelledOpenMultiDiGraph::create< -// UnorderedOutputLabelledOpenMultiDiGraph>(); -// -// Node n4 = pcg.add_node(Operator{InputAttrs{}, "input"}); -// Node n5 = pcg.add_node(Operator{ -// LinearAttrs{1, false, DataType::FLOAT, Activation::RELU, -// std::nullopt}, "linear"}); -// NodePort p4 = pcg.add_node_port(); -// NodePort p5 = pcg.add_node_port(); -// -// MultiDiEdge e4{n5, p5, n4, p4}; -// pcg.add_edge(e4); -// ParallelDim dim = {2, 1, false}; -// ParallelTensorDims dims = {FFOrdered{dim}}; -// pcg.add_label(e4, ParallelTensor(dims, DataType::FLOAT, -// CreateGrad::YES)); -// -// MatchAdditionalCriterion criterion{ -// [&](Node const &pattern_node, Node const &graph_node) { -// return operator_satisfies(pcg.at(graph_node), -// input_graph.value().at(pattern_node)); -// }, -// [&](OpenMultiDiEdge const &pattern_edge, -// OpenMultiDiEdge const &graph_edge) { -// return parallel_tensor_satisfies( -// pcg.at(graph_edge), input_graph.value().at(pattern_edge)); -// }}; -// -// RC_ASSERT(criterion.node_criterion(n0, n5)); -// -// std::vector matches = -// find_pattern_matches(input_graph, pcg, criterion); -// -// 
RC_ASSERT(matches.size() == 1); -// -// SubParallelComputationGraph new_pcg = -// apply_substitution(pcg, substitution, matches[0]); -// -// RC_ASSERT(get_nodes(new_pcg).size() == 4); -// RC_ASSERT(get_edges(new_pcg).size() == 3); -// } -// } diff --git a/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h b/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h index 86ef6c4b4d..83afc32e0c 100644 --- a/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h +++ b/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h @@ -2,14 +2,16 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_BIDICT_ALGORITHMS_BIDICT_FROM_ENUMERATING_H #include "utils/bidict/bidict.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include namespace FlexFlow { template -bidict bidict_from_enumerating(std::unordered_set const &s) { - bidict result; - int idx = 0; +bidict + bidict_from_enumerating(std::unordered_set const &s) { + bidict result; + nonnegative_int idx = 0_n; for (T const &t : s) { result.equate(idx, t); idx++; @@ -19,9 +21,9 @@ bidict bidict_from_enumerating(std::unordered_set const &s) { } template -bidict bidict_from_enumerating(std::set const &s) { - bidict result; - int idx = 0; +bidict bidict_from_enumerating(std::set const &s) { + bidict result; + nonnegative_int idx = 0_n; for (T const &t : s) { result.equate(idx, t); idx++; diff --git a/lib/utils/include/utils/cli/cli_flag_key.struct.toml b/lib/utils/include/utils/cli/cli_flag_key.struct.toml index 790a752911..9c02fddc3e 100644 --- a/lib/utils/include/utils/cli/cli_flag_key.struct.toml +++ b/lib/utils/include/utils/cli/cli_flag_key.struct.toml @@ -6,8 +6,10 @@ features = [ "fmt", ] -includes = [] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] [[fields]] name = "raw_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml b/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml index d571d0deb3..4c50c277c0 100644 --- a/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml +++ b/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml @@ -6,8 +6,10 @@ features = [ "fmt", ] -includes = [] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] [[fields]] name = "raw_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/containers/at_idx.h b/lib/utils/include/utils/containers/at_idx.h index 757da5c548..fdc13a0231 100644 --- a/lib/utils/include/utils/containers/at_idx.h +++ b/lib/utils/include/utils/containers/at_idx.h @@ -1,17 +1,18 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_AT_IDX_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_AT_IDX_H +#include "utils/nonnegative_int/nonnegative_int.h" #include #include namespace FlexFlow { template -std::optional at_idx(std::vector const &v, size_t idx) { +std::optional at_idx(std::vector const &v, nonnegative_int idx) { if (idx >= v.size()) { return std::nullopt; } else { - return v.at(idx); + return v.at(idx.unwrap_nonnegative()); } } diff --git a/lib/utils/include/utils/containers/enumerate.h b/lib/utils/include/utils/containers/enumerate.h index e3722e52c6..1e8bc1f3dc 100644 --- a/lib/utils/include/utils/containers/enumerate.h +++ b/lib/utils/include/utils/containers/enumerate.h @@ -11,14 +11,14 @@ namespace FlexFlow { /** * @brief Generate a map from indices to elements of \p c. 
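 *
 * A usage sketch (hypothetical values; assumes the nonnegative_int literal
 * suffix _n used elsewhere in this patch):
 *
 *     std::vector<std::string> v = {"a", "b", "c"};
 *     std::map<nonnegative_int, std::string> m = enumerate(v);
 *     // m == {{0_n, "a"}, {1_n, "b"}, {2_n, "c"}}, iterated in key order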
* - * @note We return a std::map rather than a - * std::vector> for consistency + * @note We return a std::map rather than a + * std::vector> for consistency * with enumerate(FFOrdered const &). Note that std::map * provides ordered iteration in increasing order, so iterating through * the result of this function should still function as expected. */ template -std::map enumerate(std::vector const &c) { +std::map enumerate(std::vector const &c) { return enumerate_vector(c); } @@ -27,16 +27,16 @@ std::map enumerate(std::vector const &c) { * return a map from indices of this ordering to elements of \p c. * - * @note We return a std::map rather than a - * std::vector> for consistency + * @note We return a std::map rather than a + * std::vector> for consistency * with enumerate(FFOrdered const &). Note that std::map * provides ordered iteration in increasing order, so iterating through * the result of this function should still function as expected. */ template -std::map enumerate(std::unordered_set const &c) { - std::map result; - int idx = 0; +std::map enumerate(std::unordered_set const &c) { + std::map result; + nonnegative_int idx = 0_n; for (auto const &v : c) { result.insert({idx++, v}); } diff --git a/lib/utils/include/utils/containers/enumerate_vector.h b/lib/utils/include/utils/containers/enumerate_vector.h index 700106ea3f..1e66279306 100644 --- a/lib/utils/include/utils/containers/enumerate_vector.h +++ b/lib/utils/include/utils/containers/enumerate_vector.h @@ -1,16 +1,19 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ENUMERATE_VECTOR_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ENUMERATE_VECTOR_H +#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" #include #include namespace FlexFlow { template -std::map enumerate_vector(std::vector const &v) { - std::map result; - for (int i = 0; i < v.size(); i++) { - result.insert({i, v.at(i)}); +std::map enumerate_vector(std::vector const &v) { + std::map result; + for (nonnegative_int i : nonnegative_range(num_elements(v))) { + result.insert({i, v.at(i.unwrap_nonnegative())}); } return result; } diff --git a/lib/utils/include/utils/containers/flatmap.h b/lib/utils/include/utils/containers/flatmap.h index b016a1e03d..a7848b88aa 100644 --- a/lib/utils/include/utils/containers/flatmap.h +++ b/lib/utils/include/utils/containers/flatmap.h @@ -4,6 +4,7 @@ #include "utils/containers/extend.h" #include "utils/containers/get_element_type.h" #include "utils/containers/merge_maps.h" +#include #include #include @@ -52,7 +53,19 @@ std::unordered_map flatmap(std::unordered_map const &m, std::unordered_map result; for (auto const &[k, v] : m) { - result = merge_maps(result, f(k, v)); + result = merge_disjoint_maps(result, f(k, v)); + } + + return result; +} + +template +std::string flatmap(std::string const &input, F const &f) { + std::string result = ""; + + for (char c : input) { + std::string for_c = f(c); + result += for_c; } return result; diff --git a/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h b/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h index ccdde0131a..0a7e9d16c2 100644 --- a/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h +++ b/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_GET_ALL_PERMUTATIONS_WITH_REPETITION_H #define 
_FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_GET_ALL_PERMUTATIONS_WITH_REPETITION_H +#include "utils/nonnegative_int/nonnegative_int.h" #include #include @@ -14,7 +15,8 @@ namespace FlexFlow { **/ template std::unordered_multiset> - get_all_permutations_with_repetition(C const &container, int n) { + get_all_permutations_with_repetition(C const &container, + nonnegative_int n) { std::unordered_multiset> result; if (container.empty() || n == 0) { @@ -22,16 +24,16 @@ std::unordered_multiset> } std::vector elements(std::begin(container), std::end(container)); - std::vector indices(n, 0); + std::vector indices(n.unwrap_nonnegative(), 0); while (true) { - std::vector perm(n); + std::vector perm(n.unwrap_nonnegative()); for (int i = 0; i < n; ++i) { perm[i] = elements[indices[i]]; } result.insert(perm); - int i = n - 1; + int i = n.unwrap_nonnegative() - 1; while (i != -1 && ++indices[i] == elements.size()) { indices[i] = 0; --i; diff --git a/lib/utils/include/utils/containers/make.h b/lib/utils/include/utils/containers/make.h new file mode 100644 index 0000000000..f7b15dfa02 --- /dev/null +++ b/lib/utils/include/utils/containers/make.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MAKE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MAKE_H + +namespace FlexFlow { + +template +decltype(auto) make() { + return [](auto const &x) { return T{x}; }; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/merge_maps.h b/lib/utils/include/utils/containers/merge_maps.h index dd886ab8aa..bfc2446d99 100644 --- a/lib/utils/include/utils/containers/merge_maps.h +++ b/lib/utils/include/utils/containers/merge_maps.h @@ -3,30 +3,64 @@ #include "utils/containers/are_disjoint.h" #include "utils/containers/keys.h" +#include "utils/containers/merge_method.dtg.h" #include "utils/exception.h" #include "utils/fmt/unordered_map.h" +#include "utils/fmt/unordered_set.h" #include namespace FlexFlow { template -std::unordered_map merge_maps(std::unordered_map const &lhs, - std::unordered_map const &rhs) { - if (!are_disjoint(keys(lhs), keys(rhs))) { - throw mk_runtime_error(fmt::format("Key sets of merge_maps parameters are " - "non-disjoint: lhs = {}, rhs = {}", - lhs, - rhs)); +void merge_in_map(std::unordered_map const &m, + std::unordered_map &result) { + for (auto const &[k, v] : m) { + auto it = result.find(k); + if (it != result.end()) { + it->second = v; + } else { + result.insert({k, v}); + } } +} - std::unordered_map result; - for (auto const &kv : lhs) { - result.insert(kv); - } - for (auto const &kv : rhs) { - result.insert(kv); +template +std::unordered_map + merge_disjoint_maps(std::unordered_map const &lhs, + std::unordered_map const &rhs) { + + std::unordered_set lhs_keys = keys(lhs); + std::unordered_set rhs_keys = keys(rhs); + std::unordered_set shared_keys = intersection(lhs_keys, rhs_keys); + if (!shared_keys.empty()) { + throw mk_runtime_error( + fmt::format("merge_maps expected disjoint maps, but maps share keys {}", + shared_keys)); } + std::unordered_map result; + merge_in_map(lhs, result); + merge_in_map(rhs, result); + return result; +} + +template +std::unordered_map + merge_map_left_dominates(std::unordered_map const &lhs, + std::unordered_map const &rhs) { + std::unordered_map result; + merge_in_map(rhs, result); + merge_in_map(lhs, result); + return result; +} + +template +std::unordered_map + merge_map_right_dominates(std::unordered_map const &lhs, + std::unordered_map const &rhs) { + std::unordered_map result; + 
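// merge_in_map overwrites keys that are already present, so call order decides
// the winner: inserting lhs first and rhs second lets rhs dominate on shared
// keys (the mirror image of merge_map_left_dominates above).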
merge_in_map(lhs, result); + merge_in_map(rhs, result); return result; } diff --git a/lib/utils/include/utils/containers/merge_method.enum.toml b/lib/utils/include/utils/containers/merge_method.enum.toml new file mode 100644 index 0000000000..ec0ed067dd --- /dev/null +++ b/lib/utils/include/utils/containers/merge_method.enum.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "MergeMethod" +features = [ + "json", + "hash", + "fmt", + "rapidcheck", +] + +[[values]] +name = "REQUIRE_DISJOINT" + +[[values]] +name = "LEFT_DOMINATES" + +[[values]] +name = "RIGHT_DOMINATES" diff --git a/lib/utils/include/utils/containers/product.h b/lib/utils/include/utils/containers/product.h index af04edcb81..30aac2681a 100644 --- a/lib/utils/include/utils/containers/product.h +++ b/lib/utils/include/utils/containers/product.h @@ -10,7 +10,7 @@ namespace FlexFlow { **/ template Element product(Container const &container) { - Element result = 1; + Element result = Element{1}; for (Element const &element : container) { result *= element; } diff --git a/lib/utils/include/utils/containers/repeat.h b/lib/utils/include/utils/containers/repeat.h index 18de92cf4a..9782d6265a 100644 --- a/lib/utils/include/utils/containers/repeat.h +++ b/lib/utils/include/utils/containers/repeat.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPEAT_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPEAT_H +#include "utils/nonnegative_int/nonnegative_int.h" #include #include #include @@ -8,9 +9,7 @@ namespace FlexFlow { template > -std::vector repeat(int n, F const &f) { - assert(n >= 0); - +std::vector repeat(nonnegative_int n, F const &f) { std::vector result; for (int i = 0; i < n; i++) { result.push_back(f()); diff --git a/lib/utils/include/utils/containers/repeat_element.h b/lib/utils/include/utils/containers/repeat_element.h new file mode 100644 index 0000000000..e1ac508116 --- /dev/null +++ b/lib/utils/include/utils/containers/repeat_element.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H + +#include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include +#include + +namespace FlexFlow { + +template +std::vector repeat_element(nonnegative_int num_times, T const &element) { + std::vector result; + for (int i = 0; i < num_times; ++i) { + result.push_back(element); + } + return result; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/replicate.h b/lib/utils/include/utils/containers/replicate.h deleted file mode 100644 index aa3d0a7e35..0000000000 --- a/lib/utils/include/utils/containers/replicate.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H -#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H - -#include - -namespace FlexFlow { - -template -std::vector replicate(int n, T const &element) { - return std::vector(n, element); -} - -} // namespace FlexFlow - -#endif diff --git a/lib/utils/include/utils/containers/sum.h b/lib/utils/include/utils/containers/sum.h index 135e704045..d6061e396e 100644 --- a/lib/utils/include/utils/containers/sum.h +++ b/lib/utils/include/utils/containers/sum.h @@ -8,7 +8,7 @@ namespace FlexFlow { **/ template Element sum(Container const &container) { - Element result = 0; + Element result = Element{0}; for (Element const &element : container) { result += element; } diff --git 
a/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h new file mode 100644 index 0000000000..b12e20124f --- /dev/null +++ b/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h @@ -0,0 +1,34 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H + +#include "utils/graph/dataflow_graph/dataflow_graph_view.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" + +namespace FlexFlow { + +struct ViewDataflowGraphAsOpenDataflowGraph final + : public IOpenDataflowGraphView { + + ViewDataflowGraphAsOpenDataflowGraph() = delete; + ViewDataflowGraphAsOpenDataflowGraph(DataflowGraphView const &); + + std::unordered_set query_nodes(NodeQuery const &) const override; + std::unordered_set + query_outputs(DataflowOutputQuery const &) const override; + std::unordered_set get_inputs() const override; + std::unordered_set + query_edges(OpenDataflowEdgeQuery const &) const override; + + ViewDataflowGraphAsOpenDataflowGraph *clone() const override; + + virtual ~ViewDataflowGraphAsOpenDataflowGraph() = default; + +private: + DataflowGraphView g; +}; + +OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml index 0b0c5a41d8..aed0c28aeb 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml @@ -10,6 +10,7 @@ features = [ includes = [ "utils/graph/query_set.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -18,7 +19,7 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "src_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" [[fields]] name = "dst_nodes" @@ -26,4 +27,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "dst_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h index 6a1898dd13..58c28aaff6 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h @@ -4,13 +4,14 @@ #include "utils/graph/dataflow_graph/dataflow_graph_view.h" #include "utils/graph/dataflow_graph/i_dataflow_graph.h" #include "utils/graph/dataflow_graph/node_added_result.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { struct DataflowGraph : virtual public DataflowGraphView { public: NodeAddedResult add_node(std::vector const &inputs, - int num_outputs); + nonnegative_int num_outputs); void add_node_unsafe(Node const &node, std::vector const &inputs, diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml index f322fa63fe..eb9c30d558 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml +++ 
b/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml @@ -9,6 +9,7 @@ features = [ includes = [ "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -17,4 +18,4 @@ type = "::FlexFlow::Node" [[fields]] name = "idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml index f3ccebe046..19d92a3d4c 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml @@ -9,6 +9,7 @@ features = [ includes = [ "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -17,4 +18,4 @@ type = "::FlexFlow::Node" [[fields]] name = "idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml index 0701855ba6..d1af6d5c0d 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml @@ -10,6 +10,10 @@ features = [ includes = [ "utils/graph/query_set.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", +] + +src_includes = [ "utils/fmt/unordered_set.h", ] @@ -19,4 +23,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "output_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h index 87882a6242..2572fe5c68 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h +++ b/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { struct IDataflowGraph : virtual public IDataflowGraphView { virtual NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) = 0; + nonnegative_int num_outputs) = 0; virtual void add_node_unsafe(Node const &node, std::vector const &inputs, diff --git a/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h b/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h index 4ed83834a2..ecba7921af 100644 --- a/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h +++ b/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h @@ -14,9 +14,9 @@ struct UnorderedSetDataflowGraph final : virtual public IDataflowGraph, UnorderedSetDataflowGraph(); NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; DataflowGraphInput add_input() override; std::unordered_set query_nodes(NodeQuery const &) const override; diff --git a/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h b/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h index f1063c1f21..159778bb6d 100644 --- a/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h @@ -57,9 +57,10 @@ 
struct UnorderedSetLabelledOpenDataflowGraph final } std::vector new_outputs = - transform(count(output_labels.size()), [&](int output_idx) { - return DataflowOutput{new_node, output_idx}; - }); + transform(nonnegative_range(num_elements(output_labels)), + [&](nonnegative_int output_idx) { + return DataflowOutput{new_node, output_idx}; + }); for (auto const &[output, output_label] : zip(new_outputs, output_labels)) { this->values.insert({OpenDataflowValue{output}, output_label}); diff --git a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h index ec8f025ac3..2115a03cda 100644 --- a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h +++ b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h @@ -4,6 +4,7 @@ #include "utils/graph/labelled_open_dataflow_graph/algorithms/labelled_open_dataflow_graph_data.dtg.h" #include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_edges.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" namespace FlexFlow { diff --git a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h index 2d1dd03755..88950635d2 100644 --- a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h +++ b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_LABELLED_OPEN_DATAFLOW_GRAPH_ALGORITHMS_PERMUTE_NODE_IDS_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_LABELLED_OPEN_DATAFLOW_GRAPH_ALGORITHMS_PERMUTE_NODE_IDS_H +#include "utils/containers/generate_map.h" #include "utils/graph/labelled_open_dataflow_graph/algorithms/with_labelling.h" #include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" #include "utils/graph/node/algorithms.h" diff --git a/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h b/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h index 737f2d0d23..80d0ca3eaf 100644 --- a/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h +++ b/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h @@ -2,10 +2,11 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_MULTIDIGRAPH_ALGORITHMS_ADD_NODES_H #include "utils/graph/multidigraph/multidigraph.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { -std::vector add_nodes(MultiDiGraph &, int num_nodes); +std::vector add_nodes(MultiDiGraph &, nonnegative_int num_nodes); } // namespace FlexFlow diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h new file mode 100644 index 0000000000..ae99e2850f --- /dev/null +++ b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_ARE_ISOMORPHIC_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_ARE_ISOMORPHIC_H + +#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" + +namespace FlexFlow { + +bool 
are_isomorphic(OpenDataflowGraphView const &, + OpenDataflowGraphView const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h new file mode 100644 index 0000000000..fe282a8c2e --- /dev/null +++ b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_OPEN_DATAFLOW_GRAPH_ISOMORPHISM_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_OPEN_DATAFLOW_GRAPH_ISOMORPHISM_H + +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.dtg.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" + +namespace FlexFlow { + +OpenDataflowValue isomorphism_map_r_open_dataflow_value_from_l( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &l_value); +OpenDataflowValue isomorphism_map_l_open_dataflow_value_from_r( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &r_value); + +DataflowOutput isomorphism_map_r_dataflow_output_from_l( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &l_output); +DataflowOutput isomorphism_map_l_dataflow_output_from_r( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &r_output); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml b/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml index 544a05af85..f67e8b88e0 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml +++ b/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml @@ -11,6 +11,7 @@ includes = [ "utils/graph/query_set.h", "utils/graph/open_dataflow_graph/dataflow_graph_input.dtg.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -23,4 +24,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "dst_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h index 6edfa408d4..9b71b06e62 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { struct IOpenDataflowGraph : virtual public IOpenDataflowGraphView { virtual NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) = 0; + nonnegative_int num_outputs) = 0; virtual DataflowGraphInput add_input() = 0; virtual IOpenDataflowGraph *clone() const = 0; diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h index 09499f8e5f..1102bf0586 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h @@ -7,7 +7,7 @@ namespace FlexFlow { Node get_open_dataflow_edge_dst_node(OpenDataflowEdge const &); -int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &); +nonnegative_int 
get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &); DataflowInput get_open_dataflow_edge_dst(OpenDataflowEdge const &); OpenDataflowValue get_open_dataflow_edge_src(OpenDataflowEdge const &); OpenDataflowEdge diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h index e8ecce76e8..9d48020d5f 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h @@ -11,7 +11,7 @@ namespace FlexFlow { struct OpenDataflowGraph : virtual public OpenDataflowGraphView { public: NodeAddedResult add_node(std::vector const &inputs, - int num_outputs); + nonnegative_int num_outputs); DataflowGraphInput add_input(); template diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h index 7b921772d6..f3d54e4329 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h @@ -12,7 +12,7 @@ struct UnorderedSetOpenDataflowGraph : public IOpenDataflowGraph { UnorderedSetOpenDataflowGraph(); NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; std::unordered_set query_nodes(NodeQuery const &) const override; std::unordered_set diff --git a/lib/utils/include/utils/graph/render_dot.h b/lib/utils/include/utils/graph/render_dot.h new file mode 100644 index 0000000000..632ba736ea --- /dev/null +++ b/lib/utils/include/utils/graph/render_dot.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RENDER_DOT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RENDER_DOT_H + +#include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" +#include +#include + +namespace FlexFlow { + +std::string escape_dot_string(std::string const &); +std::string render_dot_node_attrs( + std::unordered_map const &attrs); +std::string render_dot( + LabelledDataflowGraphView, + std::string> const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/ceildiv.h b/lib/utils/include/utils/nonnegative_int/ceildiv.h new file mode 100644 index 0000000000..939ea3de51 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/ceildiv.h @@ -0,0 +1,11 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_CEILDIV_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_CEILDIV_H + +#include "utils/nonnegative_int/nonnegative_int.h" +namespace FlexFlow { + +nonnegative_int ceildiv(nonnegative_int numerator, nonnegative_int denominator); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h index 0749497c56..0bcc8cfd6f 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h @@ -1,12 +1,11 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_INT_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_INT_H -#include "rapidcheck.h" - #include #include #include #include +#include #include namespace FlexFlow { @@ -14,6 +13,7 @@ class nonnegative_int { public: nonnegative_int() = delete; explicit nonnegative_int(int value); + 
explicit nonnegative_int(size_t value); explicit operator int() const noexcept; @@ -39,16 +39,31 @@ class nonnegative_int { friend bool operator>=(int const &lhs, nonnegative_int const &rhs); nonnegative_int operator+(nonnegative_int const &other) const; + nonnegative_int &operator++(); + nonnegative_int operator++(int); + nonnegative_int &operator+=(nonnegative_int const &other); + + nonnegative_int operator*(nonnegative_int const &other) const; + nonnegative_int &operator*=(nonnegative_int const &other); + + nonnegative_int operator/(nonnegative_int const &other) const; + nonnegative_int &operator/=(nonnegative_int const &other); + + nonnegative_int operator%(nonnegative_int const &other) const; + nonnegative_int &operator%=(nonnegative_int const &other); friend std::ostream &operator<<(std::ostream &os, nonnegative_int const &n); friend int format_as(nonnegative_int const &); - int get_value() const; + int unwrap_nonnegative() const; private: int value_; }; + +nonnegative_int operator""_n(unsigned long long int); + } // namespace FlexFlow namespace nlohmann { @@ -59,6 +74,13 @@ struct adl_serializer<::FlexFlow::nonnegative_int> { }; } // namespace nlohmann +namespace rc { +template <> +struct Arbitrary<::FlexFlow::nonnegative_int> { + static Gen<::FlexFlow::nonnegative_int> arbitrary(); +}; +} // namespace rc + namespace std { template <> struct hash<::FlexFlow::nonnegative_int> {
diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_range.h b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h new file mode 100644 index 0000000000..af323aef42 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h @@ -0,0 +1,14 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_RANGE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_RANGE_H + +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +std::vector<nonnegative_int> nonnegative_range(nonnegative_int end); +std::vector<nonnegative_int> + nonnegative_range(nonnegative_int start, nonnegative_int end, int step = 1); + +} // namespace FlexFlow + +#endif
diff --git a/lib/utils/include/utils/nonnegative_int/num_elements.h b/lib/utils/include/utils/nonnegative_int/num_elements.h new file mode 100644 index 0000000000..57bc98ee50 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/num_elements.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NUM_ELEMENTS_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NUM_ELEMENTS_H + +#include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +template <typename T> +nonnegative_int num_elements(T const &t) { + size_t t_size = t.size(); + return nonnegative_int{t_size}; +} + +} // namespace FlexFlow + +#endif
diff --git a/lib/utils/include/utils/variant.h b/lib/utils/include/utils/variant.h index 241d631200..75a8851362 100644 --- a/lib/utils/include/utils/variant.h +++ b/lib/utils/include/utils/variant.h @@ -4,6 +4,7 @@ #include "rapidcheck.h" #include "utils/type_traits.h" #include +#include #include namespace FlexFlow {
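// Editor's note: a minimal sketch (not part of the patch) of the
// nonnegative_int surface declared above: the ""_n literal, the new
// arithmetic operators, nonnegative_range, and num_elements. The function
// name nonnegative_int_usage_sketch is illustrative only.
#include "utils/nonnegative_int/nonnegative_int.h"
#include "utils/nonnegative_int/nonnegative_range.h"
#include "utils/nonnegative_int/num_elements.h"
#include <vector>

using namespace FlexFlow;

void nonnegative_int_usage_sketch() {
  nonnegative_int n = 6_n;
  nonnegative_int half = n / 2_n;   // 3
  nonnegative_int sq = half * half; // 9

  // nonnegative_range(end) enumerates {0_n, 1_n, ..., end - 1}
  std::vector<nonnegative_int> idxs = nonnegative_range(3_n);

  // num_elements wraps container.size() through the checked size_t
  // constructor added above
  nonnegative_int count = num_elements(idxs); // == 3_n
}
diff --git a/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc b/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc index 350f08600c..67e0b32d6e 100644 --- a/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc +++ b/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc @@ -1 +1,14 @@ #include "utils/bidict/algorithms/bidict_from_enumerating.h" +#include "utils/archetypes/value_type.h" +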
+namespace FlexFlow { + +using T = value_type<0>; + +template bidict + bidict_from_enumerating(std::unordered_set const &); + +template bidict + bidict_from_enumerating(std::set const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/cli/cli_parse.cc b/lib/utils/src/utils/cli/cli_parse.cc index 07982c0c2d..36d5837f9c 100644 --- a/lib/utils/src/utils/cli/cli_parse.cc +++ b/lib/utils/src/utils/cli/cli_parse.cc @@ -32,7 +32,7 @@ tl::expected {}, }; - int consumed_positional_args = 0; + nonnegative_int consumed_positional_args = 0_n; auto parse_positional_arg = [&](std::string const &arg) -> std::optional { if (consumed_positional_args >= cli.positional_arguments.size()) { @@ -40,8 +40,8 @@ tl::expected cli.positional_arguments.size()); } - CLIPositionalArgumentSpec arg_spec = - cli.positional_arguments.at(consumed_positional_args); + CLIPositionalArgumentSpec arg_spec = cli.positional_arguments.at( + consumed_positional_args.unwrap_nonnegative()); if (arg_spec.choices.has_value() && !contains(arg_spec.choices.value(), arg)) { diff --git a/lib/utils/src/utils/cli/cli_spec.cc b/lib/utils/src/utils/cli/cli_spec.cc index ca51cfe57f..e314f6fd55 100644 --- a/lib/utils/src/utils/cli/cli_spec.cc +++ b/lib/utils/src/utils/cli/cli_spec.cc @@ -2,6 +2,8 @@ #include "utils/containers/count.h" #include "utils/containers/transform.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -10,8 +12,8 @@ CLISpec empty_cli_spec() { } std::vector cli_get_flag_keys(CLISpec const &cli) { - return transform(count(cli.flags.size()), - [](int idx) { return CLIFlagKey{idx}; }); + return transform(nonnegative_range(num_elements(cli.flags)), + [](nonnegative_int idx) { return CLIFlagKey{idx}; }); } CLIArgumentKey cli_add_help_flag(CLISpec &cli) { @@ -21,17 +23,18 @@ CLIArgumentKey cli_add_help_flag(CLISpec &cli) { } CLIArgumentKey cli_add_flag(CLISpec &cli, CLIFlagSpec const &flag_spec) { + CLIArgumentKey key = CLIArgumentKey{CLIFlagKey{num_elements(cli.flags)}}; cli.flags.push_back(flag_spec); - - return CLIArgumentKey{CLIFlagKey{int_from_size_t(cli.flags.size()) - 1}}; + return key; } CLIArgumentKey cli_add_positional_argument(CLISpec &cli, CLIPositionalArgumentSpec const &arg) { + CLIArgumentKey key = CLIArgumentKey{ + CLIPositionalArgumentKey{num_elements(cli.positional_arguments)}}; cli.positional_arguments.push_back(arg); - return CLIArgumentKey{CLIPositionalArgumentKey{ - int_from_size_t(cli.positional_arguments.size()) - 1}}; + return key; } } // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/at_idx.cc b/lib/utils/src/utils/containers/at_idx.cc index 45b1a31fce..14a0695c6d 100644 --- a/lib/utils/src/utils/containers/at_idx.cc +++ b/lib/utils/src/utils/containers/at_idx.cc @@ -1 +1,10 @@ #include "utils/containers/at_idx.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using E = value_type<0>; + +template std::optional at_idx(std::vector const &, nonnegative_int); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/enumerate.cc b/lib/utils/src/utils/containers/enumerate.cc index 0984b6dc63..ca5ad6ddc1 100644 --- a/lib/utils/src/utils/containers/enumerate.cc +++ b/lib/utils/src/utils/containers/enumerate.cc @@ -1 +1,12 @@ #include "utils/containers/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(std::vector const &); + +template std::map 
enumerate(std::unordered_set const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/enumerate_vector.cc b/lib/utils/src/utils/containers/enumerate_vector.cc index d4fd131af2..0d0bd1c277 100644 --- a/lib/utils/src/utils/containers/enumerate_vector.cc +++ b/lib/utils/src/utils/containers/enumerate_vector.cc @@ -1 +1,10 @@ #include "utils/containers/enumerate_vector.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate_vector(std::vector const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/make.cc b/lib/utils/src/utils/containers/make.cc new file mode 100644 index 0000000000..29b5bc5184 --- /dev/null +++ b/lib/utils/src/utils/containers/make.cc @@ -0,0 +1,8 @@ +#include "utils/containers/make.h" +#include + +namespace FlexFlow { + +template decltype(auto) make>(); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/range.cc b/lib/utils/src/utils/containers/range.cc index d3ebd1063b..f3baab3db1 100644 --- a/lib/utils/src/utils/containers/range.cc +++ b/lib/utils/src/utils/containers/range.cc @@ -1,5 +1,6 @@ #include "utils/containers/range.h" #include +#include namespace FlexFlow { diff --git a/lib/utils/src/utils/containers/repeat.cc b/lib/utils/src/utils/containers/repeat.cc index 76e46f0fdc..777996d995 100644 --- a/lib/utils/src/utils/containers/repeat.cc +++ b/lib/utils/src/utils/containers/repeat.cc @@ -1 +1,11 @@ #include "utils/containers/repeat.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using Out = value_type<0>; +using F = std::function; + +template std::vector repeat(nonnegative_int, F const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/repeat_element.cc b/lib/utils/src/utils/containers/repeat_element.cc new file mode 100644 index 0000000000..70889eb971 --- /dev/null +++ b/lib/utils/src/utils/containers/repeat_element.cc @@ -0,0 +1,10 @@ +#include "utils/containers/repeat_element.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector repeat_element(nonnegative_int, T const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/replicate.cc b/lib/utils/src/utils/containers/replicate.cc deleted file mode 100644 index 2fb2f079f6..0000000000 --- a/lib/utils/src/utils/containers/replicate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/replicate.h" diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc index f0e52d6fc2..7069146057 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc @@ -27,7 +27,7 @@ std::vector get_outputs(DataflowGraphView const &g, Node const &n) { return sorted_by(g.query_outputs(DataflowOutputQuery{ query_set{n}, - query_set::matchall(), + query_set::matchall(), }), [](DataflowOutput const &l, DataflowOutput const &r) { return l.idx < r.idx; diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc index 47c30ce998..2ae903fa0b 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc @@ -1,27 +1,36 @@ #include "utils/graph/dataflow_graph/algorithms/as_dot.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/map_keys.h" #include 
"utils/dot_file.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/with_labelling.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/render_dot.h" #include "utils/record_formatter.h" namespace FlexFlow { -// WARN(@lockshaw): doing this all with string ids is ugly and error prone, -// as it requires duplicating the stringification logic across functions. -// -// Fixing this is tracked in issue std::string as_dot(DataflowGraphView const &g) { - std::ostringstream oss; - DotFile dot = DotFile{oss}; + auto get_node_attrs = [](Node const &) { + return std::unordered_map{}; + }; + + std::unordered_map> + node_labels = generate_map(get_nodes(g), get_node_attrs); - std::function get_node_label = - [](Node const &n) -> std::string { - return fmt::format("n{}", n.raw_uid); + auto get_output_label = [](DataflowOutput const &o) { + return fmt::to_string(o.idx); }; - as_dot(dot, g, get_node_label); - dot.close(); - return oss.str(); + std::unordered_map output_labels = + generate_map(get_all_dataflow_outputs(g), get_output_label); + std::unordered_map value_labels = + map_keys(output_labels, + [](DataflowOutput const &o) { return OpenDataflowValue{o}; }); + + return render_dot(with_labelling( + view_as_open_dataflow_graph(g), node_labels, value_labels)); } void as_dot(DotFile &dot, @@ -29,9 +38,13 @@ void as_dot(DotFile &dot, std::function const &get_node_label) { auto get_node_name = [](Node n) { return fmt::format("n{}", n.raw_uid); }; - auto get_input_field = [](int idx) { return fmt::format("i{}", idx); }; + auto get_input_field = [](nonnegative_int idx) { + return fmt::format("i{}", idx); + }; - auto get_output_field = [](int idx) { return fmt::format("o{}", idx); }; + auto get_output_field = [](nonnegative_int idx) { + return fmt::format("o{}", idx); + }; for (Node const &n : get_nodes(g)) { std::vector n_inputs = get_dataflow_inputs(g, n); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc index c07d344d05..73afc11acc 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc @@ -6,9 +6,9 @@ std::unordered_set get_dataflow_edges_from_node_to_node( DataflowGraphView const &g, Node const &src, Node const &dst) { return g.query_edges(DataflowEdgeQuery{ /*src_nodes=*/query_set{src}, - /*src_idxs=*/query_set::matchall(), + /*src_idxs=*/query_set::matchall(), /*dst_nodes=*/query_set{dst}, - /*dst_idxs=*/query_set::matchall(), + /*dst_idxs=*/query_set::matchall(), }); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc index 9500836db1..c4947f967a 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc @@ -7,9 +7,9 @@ std::vector get_incoming_edges(DataflowGraphView const &g, Node const &n) { return sorted_by(g.query_edges(DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), {n}, - query_set::matchall(), + query_set::matchall(), }), [](DataflowEdge const &l, DataflowEdge const &r) { return l.dst.idx 
< r.dst.idx; @@ -21,9 +21,9 @@ std::unordered_set std::unordered_set const &ns) { DataflowEdgeQuery query = DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set{ns}, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc index 2376e4897f..16b2b82b2d 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -7,9 +7,9 @@ std::unordered_set get_outgoing_edges(DataflowGraphView const &g, Node const &n) { return g.query_edges(DataflowEdgeQuery{ {n}, - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }); } @@ -18,9 +18,9 @@ std::unordered_set std::unordered_set const &ns) { DataflowEdgeQuery query = DataflowEdgeQuery{ query_set{ns}, - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index d17a84dd12..a06ec1ab31 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -13,9 +13,9 @@ std::unordered_set DataflowEdgeQuery query = DataflowEdgeQuery{ src_query, - query_set::matchall(), + query_set::matchall(), query_set{ns}, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc index c442a26dab..f94dd94e11 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc @@ -13,9 +13,9 @@ std::unordered_set DataflowEdgeQuery query = DataflowEdgeQuery{ query_set{ns}, - query_set::matchall(), + query_set::matchall(), dst_query, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc index 0fd0b85b71..703db4bf91 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc @@ -3,16 +3,18 @@ namespace FlexFlow { -ViewDataflowGraphAsOpen::ViewDataflowGraphAsOpen(DataflowGraphView const &g) +ViewDataflowGraphAsOpenDataflowGraph::ViewDataflowGraphAsOpenDataflowGraph( + DataflowGraphView const &g) : g(g) {} -std::unordered_set - ViewDataflowGraphAsOpen::query_nodes(NodeQuery const &q) const { +std::unordered_set ViewDataflowGraphAsOpenDataflowGraph::query_nodes( + NodeQuery const &q) const { return this->g.query_nodes(q); } std::unordered_set - ViewDataflowGraphAsOpen::query_edges(OpenDataflowEdgeQuery const &q) const { + ViewDataflowGraphAsOpenDataflowGraph::query_edges( + OpenDataflowEdgeQuery const &q) const { std::unordered_set 
closed_edges = this->g.query_edges(q.standard_edge_query); @@ -21,21 +23,23 @@ std::unordered_set } std::unordered_set - ViewDataflowGraphAsOpen::query_outputs(DataflowOutputQuery const &q) const { + ViewDataflowGraphAsOpenDataflowGraph::query_outputs( + DataflowOutputQuery const &q) const { return this->g.query_outputs(q); } std::unordered_set - ViewDataflowGraphAsOpen::get_inputs() const { + ViewDataflowGraphAsOpenDataflowGraph::get_inputs() const { return {}; } -ViewDataflowGraphAsOpen *ViewDataflowGraphAsOpen::clone() const { - return new ViewDataflowGraphAsOpen{this->g}; +ViewDataflowGraphAsOpenDataflowGraph * + ViewDataflowGraphAsOpenDataflowGraph::clone() const { + return new ViewDataflowGraphAsOpenDataflowGraph{this->g}; } OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &g) { - return OpenDataflowGraphView::create(g); + return OpenDataflowGraphView::create(g); } } // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h deleted file mode 100644 index bec9d0e019..0000000000 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LIB_UTILS_SRC_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H -#define _FLEXFLOW_LIB_UTILS_SRC_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H - -#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" - -namespace FlexFlow { - -struct ViewDataflowGraphAsOpen final : public IOpenDataflowGraphView { -public: - ViewDataflowGraphAsOpen() = delete; - ViewDataflowGraphAsOpen(DataflowGraphView const &); - - std::unordered_set query_nodes(NodeQuery const &) const override; - std::unordered_set - query_edges(OpenDataflowEdgeQuery const &) const override; - std::unordered_set - query_outputs(DataflowOutputQuery const &) const override; - std::unordered_set get_inputs() const override; - - ViewDataflowGraphAsOpen *clone() const override; - - ~ViewDataflowGraphAsOpen() = default; - -private: - DataflowGraphView g; -}; - -OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc index 2196f7a028..982969f3a5 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc @@ -5,18 +5,18 @@ namespace FlexFlow { DataflowEdgeQuery dataflow_edge_query_all() { return DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowEdgeQuery dataflow_edge_query_none() { return DataflowEdgeQuery{ query_set::match_none(), - query_set::match_none(), + query_set::match_none(), query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -30,9 +30,9 @@ bool dataflow_edge_query_includes_dataflow_edge(DataflowEdgeQuery const &q, DataflowEdgeQuery dataflow_edge_query_for_edge(DataflowEdge const &e) { return DataflowEdgeQuery{ query_set{e.src.node}, - query_set{e.src.idx}, + query_set{e.src.idx}, query_set{e.dst.node}, - query_set{e.dst.idx}, + query_set{e.dst.idx}, }; } @@ -40,9 +40,9 @@ DataflowEdgeQuery dataflow_edge_query_all_outgoing_from(DataflowOutput const &src) { 
return DataflowEdgeQuery{ query_set{src.node}, - query_set{src.idx}, + query_set{src.idx}, query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } @@ -50,9 +50,9 @@ DataflowEdgeQuery dataflow_edge_query_all_incoming_to(DataflowInput const &dst) { return DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set{dst.node}, - query_set{dst.idx}, + query_set{dst.idx}, }; } diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc index 868dd61c6d..8ed36135e1 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc @@ -4,7 +4,7 @@ namespace FlexFlow { NodeAddedResult DataflowGraph::add_node(std::vector const &inputs, - int num_outputs) { + nonnegative_int num_outputs) { return this->get_interface().add_node(inputs, num_outputs); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc index 64df4c77f2..ceaad2bfdf 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc @@ -5,14 +5,14 @@ namespace FlexFlow { DataflowOutputQuery dataflow_output_query_all() { return DataflowOutputQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowOutputQuery dataflow_output_query_none() { return DataflowOutputQuery{ query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -24,7 +24,7 @@ bool dataflow_output_query_includes_dataflow_output( DataflowOutputQuery dataflow_output_query_for_output(DataflowOutput const &o) { return DataflowOutputQuery{ query_set{o.node}, - query_set{o.idx}, + query_set{o.idx}, }; } diff --git a/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc b/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc index 300b5de546..ef9412b939 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc @@ -7,9 +7,9 @@ std::unordered_set IDataflowGraphView::query_edges(DirectedEdgeQuery const &q) const { DataflowEdgeQuery dataflow_query = DataflowEdgeQuery{ q.srcs, - matchall(), + matchall(), q.dsts, - matchall(), + matchall(), }; std::unordered_set dataflow_edges = this->query_edges(dataflow_query); diff --git a/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc b/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc index 3efea1c138..2de3056068 100644 --- a/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc +++ b/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc @@ -6,6 +6,7 @@ #include "utils/graph/digraph/algorithms/materialize_digraph_view.h" #include "utils/graph/instances/adjacency_digraph.h" #include "utils/graph/node/algorithms.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -15,7 +16,9 @@ DiGraphView transitive_closure(DiGraphView const &g) { // incredibly slow (> minutes) for even moderately sized graphs // (i.e., 200 nodes) without optimization enabled. 
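// Editor's note (illustrative, not part of the patch): the change below keeps
// the adjacency-matrix indexing on plain int by unwrapping the
// nonnegative_int keys that bidict_from_enumerating now produces, i.e.:
//
//   bidict<nonnegative_int, Node> enumerated =
//       bidict_from_enumerating(get_nodes(g));
//   bidict<int, Node> nodes = map_keys(
//       enumerated, [](nonnegative_int x) { return x.unwrap_nonnegative(); });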
- bidict nodes = bidict_from_enumerating(get_nodes(g)); + bidict nodes = + map_keys(bidict_from_enumerating(get_nodes(g)), + [](nonnegative_int x) { return x.unwrap_nonnegative(); }); std::unordered_set edges = get_edges(g); int num_nodes = nodes.size(); diff --git a/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc b/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc index 97a2439263..69b24b716c 100644 --- a/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc +++ b/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc @@ -37,7 +37,9 @@ DiGraphView transitive_reduction(DiGraphView const &g) { // transitive_closure inlined to avoid any drifts in node numbering // between transitive_closure and transitive_reduction - bidict nodes = bidict_from_enumerating(get_nodes(g)); + bidict nodes = + map_keys(bidict_from_enumerating(get_nodes(g)), + [](nonnegative_int x) { return x.unwrap_nonnegative(); }); int num_nodes = nodes.size(); std::vector edge_matrix(num_nodes * num_nodes, false); diff --git a/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc b/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc index 1ffc5f423f..a5a1fb82bf 100644 --- a/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc @@ -1,6 +1,5 @@ #include "utils/graph/instances/unordered_set_dataflow_graph.h" #include "utils/containers/are_disjoint.h" -#include "utils/containers/count.h" #include "utils/containers/enumerate_vector.h" #include "utils/containers/extend.h" #include "utils/containers/transform.h" @@ -9,6 +8,7 @@ #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/open_dataflow_edge.h" #include "utils/graph/open_dataflow_graph/open_dataflow_edge_query.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -25,18 +25,18 @@ UnorderedSetDataflowGraph::UnorderedSetDataflowGraph( } NodeAddedResult UnorderedSetDataflowGraph::add_node( - std::vector const &inputs, int num_outputs) { + std::vector const &inputs, nonnegative_int num_outputs) { std::vector open_inputs = transform( inputs, [](DataflowOutput const &o) { return OpenDataflowValue{o}; }); return this->add_node(open_inputs, num_outputs); } NodeAddedResult UnorderedSetDataflowGraph::add_node( - std::vector const &inputs, int num_outputs) { + std::vector const &inputs, nonnegative_int num_outputs) { Node new_node = this->node_source.new_node(); - std::vector new_outputs = - transform(count(num_outputs), [&](int output_idx) { + std::vector new_outputs = transform( + nonnegative_range(num_outputs), [&](nonnegative_int output_idx) { return DataflowOutput{new_node, output_idx}; }); diff --git a/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc b/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc index a404423284..fd4a8782a4 100644 --- a/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc +++ b/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc @@ -3,7 +3,7 @@ namespace FlexFlow { -std::vector add_nodes(MultiDiGraph &g, int num_nodes) { +std::vector add_nodes(MultiDiGraph &g, nonnegative_int num_nodes) { return repeat(num_nodes, [&]() { return g.add_node(); }); } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc new file mode 100644 index 0000000000..f7f8a9fd34 --- /dev/null 
+++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc @@ -0,0 +1,11 @@ +#include "utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h" +#include "utils/graph/open_dataflow_graph/algorithms/find_isomorphism.h" + +namespace FlexFlow { + +bool are_isomorphic(OpenDataflowGraphView const &src, + OpenDataflowGraphView const &dst) { + return find_isomorphism(src, dst).has_value(); +} + +} // namespace FlexFlow
diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc index 9077ea5f9a..261de287a9 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc @@ -2,13 +2,16 @@ #include "utils/dot_file.h" #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/dataflow_graph/algorithms/as_dot.h" +#include "utils/graph/labelled_dataflow_graph/labelled_dataflow_graph.h" #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { std::string as_dot(OpenDataflowGraphView const &g) { + std::function<std::string(Node const &)> get_node_label = [](Node const &n) { return fmt::format("n{}", n.raw_uid); }; @@ -36,9 +39,13 @@ std::string auto get_node_name = [](Node n) { return fmt::format("n{}", n.raw_uid); }; - auto get_input_field = [](int idx) { return fmt::format("i{}", idx); }; + auto get_input_field = [](nonnegative_int idx) { + return fmt::format("i{}", idx); + }; - auto get_output_field = [](int idx) { return fmt::format("o{}", idx); }; + auto get_output_field = [](nonnegative_int idx) { + return fmt::format("o{}", idx); + }; auto get_graph_input_name = [](DataflowGraphInput i) { return fmt::format("gi{}", i.idx);
diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc index cad00c71e1..728dc75678 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc @@ -27,13 +27,13 @@ std::vector<OpenDataflowEdge> get_incoming_edges(OpenDataflowGraphView const &g, DataflowInputEdgeQuery{ query_set<DataflowGraphInput>::matchall(), {n}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), }, DataflowEdgeQuery{ query_set<Node>::matchall(), - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), {n}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), }, }), [](OpenDataflowEdge const &l, OpenDataflowEdge const &r) {
diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index 95a8e095fc..6448da9c73 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -13,13 +13,13 @@ std::unordered_set<OpenDataflowEdge> DataflowInputEdgeQuery{ query_set<DataflowGraphInput>::matchall(), query_set{ns}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), }, DataflowEdgeQuery{ query_set{nodes_not_in_ns}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), query_set{ns}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), }, };
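// Editor's note: a brief sketch (not part of the patch) of the are_isomorphic
// helper above. find_isomorphism returns an optional isomorphism witness;
// are_isomorphic simply asks whether one exists. The function name
// graphs_match is illustrative only.
#include "utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h"

using namespace FlexFlow;

bool graphs_match(OpenDataflowGraphView const &lhs,
                  OpenDataflowGraphView const &rhs) {
  // true iff some node/graph-input mapping makes the two views identical
  return are_isomorphic(lhs, rhs);
}
diff --git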
a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc new file mode 100644 index 0000000000..c55c4fe360 --- /dev/null +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc @@ -0,0 +1,54 @@ +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h" +#include "utils/overload.h" + +namespace FlexFlow { + +OpenDataflowValue isomorphism_map_r_open_dataflow_value_from_l( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &l_value) { + return l_value.visit(overload{ + [&](DataflowGraphInput const &l_input) { + return OpenDataflowValue{ + iso.input_mapping.at_l(l_input), + }; + }, + [&](DataflowOutput const &l_output) { + return OpenDataflowValue{ + isomorphism_map_r_dataflow_output_from_l(iso, l_output), + }; + }, + }); +} + +OpenDataflowValue isomorphism_map_l_open_dataflow_value_from_r( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &r_value) { + return r_value.visit(overload{ + [&](DataflowGraphInput const &r_input) { + return OpenDataflowValue{ + iso.input_mapping.at_r(r_input), + }; + }, + [&](DataflowOutput const &r_output) { + return OpenDataflowValue{ + isomorphism_map_l_dataflow_output_from_r(iso, r_output), + }; + }, + }); +} + +DataflowOutput isomorphism_map_r_dataflow_output_from_l( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &l_output) { + return DataflowOutput{ + iso.node_mapping.at_l(l_output.node), + l_output.idx, + }; +} + +DataflowOutput isomorphism_map_l_dataflow_output_from_r( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &r_output) { + return DataflowOutput{ + iso.node_mapping.at_r(r_output.node), + r_output.idx, + }; +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc b/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc index 8736f2d157..34adea6b09 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc @@ -6,14 +6,14 @@ DataflowInputEdgeQuery dataflow_input_edge_query_all() { return DataflowInputEdgeQuery{ query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowInputEdgeQuery dataflow_input_edge_query_none() { return DataflowInputEdgeQuery{ query_set::match_none(), query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -28,7 +28,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set{e.src}, query_set{e.dst.node}, - query_set{e.dst.idx}, + query_set{e.dst.idx}, }; } @@ -37,7 +37,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set{src}, query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } @@ -46,7 +46,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set::matchall(), query_set{dst.node}, - query_set{dst.idx}, + query_set{dst.idx}, }; } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc index d5e5b614af..d51562a6c6 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc @@ -7,7 +7,7 @@ Node get_open_dataflow_edge_dst_node(OpenDataflowEdge const &e) { 
return get_open_dataflow_edge_dst(e).node; } -int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &e) { +nonnegative_int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &e) { return get_open_dataflow_edge_dst(e).idx; } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc index 63222dd360..949f837665 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc @@ -4,7 +4,7 @@ namespace FlexFlow { NodeAddedResult OpenDataflowGraph::add_node(std::vector const &inputs, - int num_outputs) { + nonnegative_int num_outputs) { return this->get_interface().add_node(inputs, num_outputs); } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc b/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc index 0fdb2f408b..171b321c66 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc @@ -18,7 +18,7 @@ UnorderedSetOpenDataflowGraph::UnorderedSetOpenDataflowGraph( outputs(outputs), graph_inputs(graph_inputs) {} NodeAddedResult UnorderedSetOpenDataflowGraph::add_node( - std::vector const &inputs, int num_outputs) { + std::vector const &inputs, nonnegative_int num_outputs) { NOT_IMPLEMENTED(); } diff --git a/lib/utils/src/utils/graph/render_dot.cc b/lib/utils/src/utils/graph/render_dot.cc new file mode 100644 index 0000000000..8bdc001c80 --- /dev/null +++ b/lib/utils/src/utils/graph/render_dot.cc @@ -0,0 +1,90 @@ +#include "utils/graph/render_dot.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/try_at.h" +#include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/record_formatter.h" + +namespace FlexFlow { + +std::string escape_dot_string(std::string const &s) { + auto escape_dot_char = [](char c) -> std::string { + switch (c) { + case '\\': + case '"': + return std::string{'\\'} + c; + default: + return std::string{c}; + } + }; + + return flatmap(s, escape_dot_char); +} + +std::string render_dot_node_attrs( + std::unordered_map const &node_attrs) { + std::ostringstream oss; + for (auto const &[k, v] : node_attrs) { + oss << fmt::format( + "\"{}\"=\"{}\",", escape_dot_string(k), escape_dot_string(v)); + } + return oss.str(); +} + +std::string render_node_label( + LabelledDataflowGraphView, + std::string> const &g, + Node const &n) { + std::vector n_inputs = get_dataflow_inputs(g, n); + std::vector n_outputs = get_outputs(g, n); + + RecordFormatter inputs_record; + for (DataflowInput const &i : n_inputs) { + inputs_record << fmt::format("{}", i.idx, i.idx); + } + + RecordFormatter outputs_record; + for (DataflowOutput const &o : n_outputs) { + outputs_record << fmt::format("{}", o.idx, g.at(o)); + } + + RecordFormatter rec; + rec << inputs_record + << try_at(g.at(n), std::string{"label"}) + .value_or(fmt::to_string(n.raw_uid)) + << outputs_record; + + std::ostringstream oss; + oss << rec; + return oss.str(); +} + +std::string render_dot( + LabelledDataflowGraphView, + std::string> const &g) { + std::vector lines; + lines.push_back("digraph {"); + + for (Node const &n : get_nodes(g)) { + std::unordered_map node_attrs = g.at(n); + node_attrs.at("label") = render_node_label(g, n); + node_attrs["shape"] = "record"; + + 
lines.push_back(fmt::format( + " n{} [{}];", n.raw_uid, render_dot_node_attrs(node_attrs))); + } + + for (DataflowEdge const &e : get_edges(g)) { + lines.push_back(fmt::format(" n{}:o{} -> n{}:i{};", + e.src.node.raw_uid, + e.src.idx, + e.dst.node.raw_uid, + e.dst.idx)); + } + + lines.push_back("}"); + + return join_strings(lines, "\n"); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/src/utils/nonnegative_int/ceildiv.cc new file mode 100644 index 0000000000..f1115b25b5 --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/ceildiv.cc @@ -0,0 +1,20 @@ +#include "utils/nonnegative_int/ceildiv.h" +#include "utils/exception.h" + +namespace FlexFlow { + +nonnegative_int ceildiv(nonnegative_int numerator, + nonnegative_int denominator) { + if (denominator == 0) { + throw mk_runtime_error(fmt::format( + "ceildiv expected denominator != 0, but received {}", denominator)); + } + + int n = numerator.unwrap_nonnegative(); + int d = denominator.unwrap_nonnegative(); + + int result = (n + d - 1) / d; + return nonnegative_int{result}; +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc index 9088cc4bf9..e86c242250 100644 --- a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc @@ -1,4 +1,5 @@ #include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/exception.h" namespace FlexFlow { @@ -10,6 +11,15 @@ nonnegative_int::nonnegative_int(int value) { this->value_ = value; } +nonnegative_int::nonnegative_int(size_t value) { + if (value > std::numeric_limits::max()) { + throw std::invalid_argument(fmt::format( + "Input {} to nonnegative_int(size_t) is out-of-bounds for int", value)); + } + this->value_ = static_cast(value); + assert(this->value_ >= 0); +} + nonnegative_int::operator int() const noexcept { return this->value_; } @@ -75,18 +85,72 @@ nonnegative_int nonnegative_int::operator+(nonnegative_int const &other) const { return nonnegative_int{this->value_ + other.value_}; } +nonnegative_int &nonnegative_int::operator++() { + this->value_++; + return *this; +} + +nonnegative_int nonnegative_int::operator++(int) { + nonnegative_int result = *this; + this->value_++; + return result; +} + +nonnegative_int &nonnegative_int::operator+=(nonnegative_int const &other) { + this->value_ += other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator*(nonnegative_int const &other) const { + return nonnegative_int{this->value_ * other.value_}; +} + +nonnegative_int &nonnegative_int::operator*=(nonnegative_int const &other) { + this->value_ *= other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator/(nonnegative_int const &other) const { + return nonnegative_int{this->value_ / other.value_}; +} + +nonnegative_int &nonnegative_int::operator/=(nonnegative_int const &other) { + this->value_ /= other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator%(nonnegative_int const &other) const { + return nonnegative_int{this->value_ % other.value_}; +} + +nonnegative_int &nonnegative_int::operator%=(nonnegative_int const &other) { + this->value_ %= other.value_; + return *this; +} + std::ostream &operator<<(std::ostream &os, nonnegative_int const &n) { os << n.value_; return os; } -int nonnegative_int::get_value() const { +int nonnegative_int::unwrap_nonnegative() const { return this->value_; } int format_as(nonnegative_int const &x) { - 
return x.get_value(); + return x.unwrap_nonnegative(); } + +nonnegative_int operator""_n(unsigned long long int x) { + if (x > + static_cast<unsigned long long int>(std::numeric_limits<int>::max())) { + throw mk_runtime_error( + fmt::format("Value too large to wrap as nonnegative_int: {}", x)); + } + + return nonnegative_int{static_cast<int>(x)}; +} + } // namespace FlexFlow namespace nlohmann { @@ -97,13 +161,20 @@ ::FlexFlow::nonnegative_int void adl_serializer<::FlexFlow::nonnegative_int>::to_json( json &j, ::FlexFlow::nonnegative_int t) { - j = t.get_value(); + j = t.unwrap_nonnegative(); } } // namespace nlohmann +namespace rc { +Gen<::FlexFlow::nonnegative_int> + Arbitrary<::FlexFlow::nonnegative_int>::arbitrary() { + return gen::construct<::FlexFlow::nonnegative_int>(gen::nonNegative<int>()); +} +} // namespace rc + namespace std { std::size_t hash<::FlexFlow::nonnegative_int>::operator()( FlexFlow::nonnegative_int const &n) const noexcept { - return std::hash<int>{}(n.get_value()); + return std::hash<int>{}(n.unwrap_nonnegative()); } } // namespace std
diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc new file mode 100644 index 0000000000..f31db6d589 --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc @@ -0,0 +1,19 @@ +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/containers/range.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +std::vector<nonnegative_int> nonnegative_range(nonnegative_int end) { + return transform(range(end.unwrap_nonnegative()), + [](int x) { return nonnegative_int{x}; }); +} + +std::vector<nonnegative_int> + nonnegative_range(nonnegative_int start, nonnegative_int end, int step) { + return transform( + range(start.unwrap_nonnegative(), end.unwrap_nonnegative(), step), + [](int x) { return nonnegative_int{x}; }); +} + +} // namespace FlexFlow
diff --git a/lib/utils/src/utils/nonnegative_int/num_elements.cc b/lib/utils/src/utils/nonnegative_int/num_elements.cc new file mode 100644 index 0000000000..21292bf2ab --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/num_elements.cc @@ -0,0 +1,10 @@ +#include "utils/nonnegative_int/num_elements.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using E = value_type<0>; + +template nonnegative_int num_elements(std::vector<E> const &); + +} // namespace FlexFlow
diff --git a/lib/utils/test/src/main.cc b/lib/utils/test/src/main.cc deleted file mode 100644 index 9522fa7fdb..0000000000 --- a/lib/utils/test/src/main.cc +++ /dev/null @@ -1,2 +0,0 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h"
diff --git a/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc b/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc index b5a373e5c9..a669869fb8 100644 --- a/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc +++ b/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc @@ -10,10 +10,12 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("bidict_from_enumerating(std::unordered_set)") { std::unordered_set<std::string> input = {"zero", "one", "two"}; - bidict<int, std::string> result = bidict_from_enumerating(input); + bidict<nonnegative_int, std::string> result = + bidict_from_enumerating(input); - std::unordered_set<int> result_left_entries = left_entries(result); - std::unordered_set<int> correct_left_entries = {0, 1, 2}; + std::unordered_set<nonnegative_int> result_left_entries = + left_entries(result); + std::unordered_set<nonnegative_int> correct_left_entries = {0_n, 1_n, 2_n}; CHECK(result_left_entries == correct_left_entries); std::unordered_set<std::string>
result_right_entries = @@ -25,13 +27,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("bidict_from_enumerating(std::set)") { std::set input = {"a", "c", "b"}; - bidict correct = { - {0, "a"}, - {1, "b"}, - {2, "c"}, + bidict correct = { + {0_n, "a"}, + {1_n, "b"}, + {2_n, "c"}, }; - bidict result = bidict_from_enumerating(input); + bidict result = + bidict_from_enumerating(input); CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/cli/cli_parse.cc b/lib/utils/test/src/utils/cli/cli_parse.cc index 40dea86ae0..72a09efbde 100644 --- a/lib/utils/test/src/utils/cli/cli_parse.cc +++ b/lib/utils/test/src/utils/cli/cli_parse.cc @@ -24,8 +24,8 @@ TEST_SUITE(FF_TEST_SUITE) { {}, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; SUBCASE("correctly parses short flag") { std::string input = "-2"; @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { }, {}, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; SUBCASE("parses flags in any order") { std::vector inputs = {"prog_name", "-2", "--flag1"}; @@ -180,8 +180,8 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; SUBCASE("can parse multiple positional arguments") { std::vector inputs = {"prog_name", "hello", "world"}; @@ -266,7 +266,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CLIPositionalArgumentKey key_posarg = CLIPositionalArgumentKey{0}; + CLIPositionalArgumentKey key_posarg = CLIPositionalArgumentKey{0_n}; SUBCASE( "succeeds if a positional argument is set to a valid choice") { @@ -351,11 +351,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; - CLIFlagKey key_flag3 = CLIFlagKey{2}; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; + CLIFlagKey key_flag3 = CLIFlagKey{2_n}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; SUBCASE("works if flags are before positional arguments") { std::vector inputs = { @@ -449,11 +449,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; - CLIFlagKey key_flag3 = CLIFlagKey{2}; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; + CLIFlagKey key_flag3 = CLIFlagKey{2_n}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; int argc = 5; char const *argv[] = {"prog_name", "red", "-f", "world", "--flag3"}; diff --git a/lib/utils/test/src/utils/containers/at_idx.cc b/lib/utils/test/src/utils/containers/at_idx.cc new file mode 100644 index 0000000000..b2a6286b62 --- /dev/null +++ b/lib/utils/test/src/utils/containers/at_idx.cc @@ -0,0 +1,29 @@ +#include 
"utils/containers/at_idx.h" +#include "test/utils/doctest/fmt/optional.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("at_idx(std::vector, nonnegative_int)") { + std::vector vec = {1, 3, 2, 3}; + + SUBCASE("idx is in bounds") { + nonnegative_int idx = 1_n; + + std::optional result = at_idx(vec, idx); + std::optional correct = 3; + + CHECK(result == correct); + } + + SUBCASE("idx is out of bounds") { + nonnegative_int idx = 4_n; + + std::optional result = at_idx(vec, idx); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/containers/enumerate.cc b/lib/utils/test/src/utils/containers/enumerate.cc index 2f9a5b3c02..2fdb2e481e 100644 --- a/lib/utils/test/src/utils/containers/enumerate.cc +++ b/lib/utils/test/src/utils/containers/enumerate.cc @@ -17,26 +17,27 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("enumerate(std::vector)") { std::vector input = {"zero", "one", "two", "three"}; - std::map correct = { - {0, "zero"}, - {1, "one"}, - {2, "two"}, - {3, "three"}, + std::map correct = { + {0_n, "zero"}, + {1_n, "one"}, + {2_n, "two"}, + {3_n, "three"}, }; - std::map result = enumerate(input); + std::map result = enumerate(input); CHECK(result == correct); SUBCASE("check iteration order") { - std::vector> iterated_result = - vector_of(result); - std::vector> correct_iteration_order = { - {0, "zero"}, - {1, "one"}, - {2, "two"}, - {3, "three"}, - }; + std::vector> + iterated_result = vector_of(result); + std::vector> + correct_iteration_order = { + {0_n, "zero"}, + {1_n, "one"}, + {2_n, "two"}, + {3_n, "three"}, + }; CHECK(iterated_result == correct_iteration_order); } @@ -45,9 +46,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("enumerate(std::unordered_set)") { std::unordered_set input = {"A", "B", "C", "D"}; - std::unordered_set correct_keys = {0, 1, 2, 3}; + std::unordered_set correct_keys = {0_n, 1_n, 2_n, 3_n}; std::unordered_multiset correct_values = {"A", "B", "C", "D"}; - std::map result = enumerate(input); + std::map result = enumerate(input); CHECK(keys(result) == correct_keys); CHECK(unordered_multiset_of(values(result)) == correct_values); diff --git a/lib/utils/test/src/utils/containers/enumerate_vector.cc b/lib/utils/test/src/utils/containers/enumerate_vector.cc new file mode 100644 index 0000000000..fa5c5cf6fb --- /dev/null +++ b/lib/utils/test/src/utils/containers/enumerate_vector.cc @@ -0,0 +1,33 @@ +#include "utils/containers/enumerate_vector.h" +#include "test/utils/doctest/fmt/map.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("enumerate_vector(std::vector)") { + SUBCASE("input vector is empty") { + std::vector input = {}; + + std::map result = enumerate_vector(input); + std::map correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input vector is not empty") { + std::vector input = {2, 3, 1, 3, 3}; + + std::map result = enumerate_vector(input); + std::map correct = { + {0_n, 2}, + {1_n, 3}, + {2_n, 1}, + {3_n, 3}, + {4_n, 3}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/containers/flatmap.cc b/lib/utils/test/src/utils/containers/flatmap.cc index bd6d3ae5be..6a6d3c86a8 100644 --- a/lib/utils/test/src/utils/containers/flatmap.cc +++ b/lib/utils/test/src/utils/containers/flatmap.cc @@ -73,6 +73,38 @@ TEST_SUITE(FF_TEST_SUITE) { } } + TEST_CASE("flatmap(std::string, F)") { + std::string input = "aBabcBc"; + + SUBCASE("replacement length > 1") { + std::string result = flatmap(input, [](char c) -> 
std::string {
+        if (c == 'B') {
+          return "..";
+        } else {
+          return std::string{c};
+        }
+      });
+
+      std::string correct = "a..abc..c";
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("replacement length == 0") {
+      std::string result = flatmap(input, [](char c) -> std::string {
+        if (c == 'B') {
+          return "";
+        } else {
+          return std::string{c};
+        }
+      });
+
+      std::string correct = "aabcc";
+
+      CHECK(result == correct);
+    }
+  }
+
   TEST_CASE("flatmap(std::unordered_map, F)") {
     auto de_nest_keys = [](int k1,
                            std::unordered_map const &v) {
diff --git a/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc b/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc
index f25bcf65b1..9fb4048691 100644
--- a/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc
+++ b/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc
@@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     std::vector<int> input = {1, 2, 3};
 
     std::unordered_multiset<std::vector<int>> result =
-        get_all_permutations_with_repetition(input, 1);
+        get_all_permutations_with_repetition(input, 1_n);
     std::unordered_multiset<std::vector<int>> correct = {
         {1},
         {2},
@@ -27,7 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     std::vector<int> input = {1};
 
     std::unordered_multiset<std::vector<int>> result =
-        get_all_permutations_with_repetition(input, 2);
+        get_all_permutations_with_repetition(input, 2_n);
     std::unordered_multiset<std::vector<int>> correct = {
         {1, 1},
     };
@@ -39,7 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     std::vector<int> input = {1, 2};
 
     std::unordered_multiset<std::vector<int>> result =
-        get_all_permutations_with_repetition(input, 3);
+        get_all_permutations_with_repetition(input, 3_n);
     std::unordered_multiset<std::vector<int>> correct = {
         {1, 1, 1},
         {1, 1, 2},
@@ -58,7 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     std::vector<int> input = {1, 2, 2};
 
     std::unordered_multiset<std::vector<int>> result =
-        get_all_permutations_with_repetition(input, 2);
+        get_all_permutations_with_repetition(input, 2_n);
     std::unordered_multiset<std::vector<int>> correct = {{1, 1},
                                                          {1, 2},
                                                          {1, 2},
diff --git a/lib/utils/test/src/utils/containers/make.cc b/lib/utils/test/src/utils/containers/make.cc
new file mode 100644
index 0000000000..4070f5b35a
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/make.cc
@@ -0,0 +1,15 @@
+#include "utils/containers/make.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("make") {
+    auto f = make<int>();
+
+    int result = f(true);
+    int correct = 1;
+
+    CHECK(result == correct);
+  }
+}
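A minimal sketch of how the helper exercised by the new make.cc test is meant
to be used (the transform call below is illustrative, not from the diff):

    auto to_int = make<int>(); // unary callable performing int{x}
    int one = to_int(true);    // == 1, as the test checks
    // handy wherever a higher-order utility expects a callable, e.g.
    // transform(std::vector<bool>{true, false}, make<int>()) yields {1, 0}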
SUBCASE("overlapping keys") { - std::unordered_map lhs = {{1, "one"}, {2, "two"}}; - std::unordered_map rhs = {{2, "three"}, {3, "four"}}; - - CHECK_THROWS(merge_maps(lhs, rhs)); + SUBCASE("maps are not disjoint") { + CHECK_THROWS(merge_disjoint_maps(l_map, l_map)); } } + + TEST_CASE("merge_map_left_dominates") { + std::unordered_map l_map = { + {1, "one"}, + {2, "left_two"}, + }; + + std::unordered_map r_map = { + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map correct = { + {1, "one"}, + {2, "left_two"}, + {3, "three"}, + }; + + std::unordered_map result = + merge_map_left_dominates(l_map, r_map); + + CHECK(result == correct); + } + + TEST_CASE("merge_map_right_dominates") { + std::unordered_map l_map = { + {1, "one"}, + {2, "left_two"}, + }; + + std::unordered_map r_map = { + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map correct = { + {1, "one"}, + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map result = + merge_map_right_dominates(l_map, r_map); + + CHECK(result == correct); + } } diff --git a/lib/utils/test/src/utils/containers/product.cc b/lib/utils/test/src/utils/containers/product.cc index 3fa94c8e9e..2278bfba17 100644 --- a/lib/utils/test/src/utils/containers/product.cc +++ b/lib/utils/test/src/utils/containers/product.cc @@ -1,4 +1,6 @@ #include "utils/containers/product.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include #include #include #include @@ -29,4 +31,22 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(correct == result); } } + + TEST_CASE("product(std::vector)") { + SUBCASE("non-empty container") { + std::vector input = {1_n, 2_n, 3_n, 5_n}; + nonnegative_int correct = 30_n; + auto result = product(input); + CHECK(correct == result); + } + + SUBCASE("empty container") { + std::vector input = {5_n}; + nonnegative_int correct = 5_n; + // correct = nonnegative_int{x}; + // CHECK(x == 3); + nonnegative_int result = product(input); + CHECK(correct == correct); + } + } } diff --git a/lib/utils/test/src/utils/containers/repeat.cc b/lib/utils/test/src/utils/containers/repeat.cc index d8ffe76a64..d2fc595f49 100644 --- a/lib/utils/test/src/utils/containers/repeat.cc +++ b/lib/utils/test/src/utils/containers/repeat.cc @@ -7,7 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("repeat") { int x = 0; - std::vector result = repeat(3, [&]() { + std::vector result = repeat(3_n, [&]() { int result = x; x += 2; return result; diff --git a/lib/utils/test/src/utils/containers/replicate.cc b/lib/utils/test/src/utils/containers/repeat_element.cc similarity index 69% rename from lib/utils/test/src/utils/containers/replicate.cc rename to lib/utils/test/src/utils/containers/repeat_element.cc index 1c7845642e..08bee8bec8 100644 --- a/lib/utils/test/src/utils/containers/replicate.cc +++ b/lib/utils/test/src/utils/containers/repeat_element.cc @@ -1,4 +1,4 @@ -#include "utils/containers/replicate.h" +#include "utils/containers/repeat_element.h" #include "test/utils/doctest/fmt/unordered_set.h" #include "test/utils/doctest/fmt/vector.h" #include @@ -7,16 +7,17 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("replicate") { + TEST_CASE("repeat_element") { SUBCASE("ints") { int x = 42; - std::vector result = replicate(5, x); + std::vector result = repeat_element(nonnegative_int{5}, x); std::vector correct = {42, 42, 42, 42, 42}; CHECK(result == correct); } SUBCASE("unordered_set") { std::unordered_set x = {1.0, 1.5}; - std::vector> result = replicate(3, x); + std::vector> result = + 
repeat_element(nonnegative_int{3}, x); std::vector> correct = { {1.0, 1.5}, {1.0, 1.5}, {1.0, 1.5}}; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc index 25f990f80e..ff491f6b85 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc @@ -12,19 +12,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_inputs/get_outputs") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({}, 1); + NodeAddedResult n3_added = g.add_node({}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -44,15 +44,15 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("topological_ordering") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc index f991b4a65e..0f812f2dec 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc @@ -11,21 +11,21 @@ TEST_SUITE(FF_TEST_SUITE) { "dataflow_graphs_are_isomorphic(DataflowGraphView, DataflowGraphView)") { auto g1 = DataflowGraph::create(); - NodeAddedResult g1_n1_added = g1.add_node({}, 1); + NodeAddedResult g1_n1_added = g1.add_node({}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); - NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1); + NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1_n); Node g1_n2_node = g1_n2_added.node; auto g2 = DataflowGraph::create(); SUBCASE("input graphs are isomorphic") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = true; @@ -36,12 +36,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input graphs are not isomorphic (different connectivity)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 
1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = - g2.add_node({g2_n1_output, g2_n1_output}, 1); + g2.add_node({g2_n1_output, g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = false; @@ -53,14 +53,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of src and sink " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 1); + NodeAddedResult g2_n3_added = g2.add_node({}, 1_n); Node g2_n3_node = g2_n3_added.node; bool correct = false; @@ -72,15 +72,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of internal " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; DataflowOutput g2_n2_output = get_only(g2_n2_added.outputs); - NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1); + NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1_n); Node g2_n3_node = g2_n3_added.node; bool correct = false; diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc index 160e4c4f73..8974d09832 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc @@ -10,21 +10,21 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_isomorphism(DataflowGraphView, DataflowGraphView)") { auto g1 = DataflowGraph::create(); - NodeAddedResult g1_n1_added = g1.add_node({}, 1); + NodeAddedResult g1_n1_added = g1.add_node({}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); - NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1); + NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1_n); Node g1_n2_node = g1_n2_added.node; auto g2 = DataflowGraph::create(); SUBCASE("input graphs are isomorphic") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct_isomorphism = @@ -41,12 +41,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input graphs are not isomorphic (different connectivity)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = - g2.add_node({g2_n1_output, g2_n1_output}, 1); + g2.add_node({g2_n1_output, g2_n1_output}, 1_n); Node g2_n2_node = 
g2_n2_added.node; std::optional correct_isomorphism = @@ -59,14 +59,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of src and sink " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 0); + NodeAddedResult g2_n3_added = g2.add_node({}, 0_n); Node g2_n3_node = g2_n3_added.node; std::optional correct_isomorphism = @@ -79,15 +79,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of internal " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; DataflowOutput g2_n2_output = get_only(g2_n2_added.outputs); - NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1); + NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1_n); Node g2_n3_node = g2_n3_added.node; std::optional correct_isomorphism = diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc index fec5d3401e..e619cc3b1c 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc @@ -11,12 +11,12 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraph g = DataflowGraph::create(); SUBCASE("gets edges if there are multiple") { - NodeAddedResult n1_added = g.add_node({}, 2); + NodeAddedResult n1_added = g.add_node({}, 2_n); Node n1 = n1_added.node; DataflowOutput n1_o0 = n1_added.outputs.at(0); DataflowOutput n1_o1 = n1_added.outputs.at(1); - NodeAddedResult n2_added = g.add_node({n1_o0, n1_o0, n1_o1}, 0); + NodeAddedResult n2_added = g.add_node({n1_o0, n1_o0, n1_o1}, 0_n); Node n2 = n2_added.node; std::unordered_set result = @@ -24,15 +24,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ n1_o0, - DataflowInput{n2, 0}, + DataflowInput{n2, 0_n}, }, DataflowEdge{ n1_o0, - DataflowInput{n2, 1}, + DataflowInput{n2, 1_n}, }, DataflowEdge{ n1_o1, - DataflowInput{n2, 2}, + DataflowInput{n2, 2_n}, }, }; @@ -40,15 +40,15 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("does not get edges to/from other nodes") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); @@ -61,11 +61,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE( "does not get flipped edges (i.e., respects from vs to direction)") { - 
NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 0); + NodeAddedResult n2_added = g.add_node({o1}, 0_n); Node n2 = n2_added.node; std::unordered_set result = @@ -76,10 +76,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("returns empty set if no edges exist between the given nodes") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; std::unordered_set result = @@ -91,7 +91,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns empty set if src node == dst node (as cycles cannot exist " "in DataflowGraph") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; std::unordered_set result = diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc index 86e4802cdb..f55afbacc1 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc @@ -10,34 +10,34 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_incoming_edges(DataflowGraphView, Node)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); SUBCASE("n4 - multiple incoming edges") { std::vector result = get_incoming_edges(g, n4); std::vector correct = { - DataflowEdge{o2, DataflowInput{n4, 0}}, - DataflowEdge{o3, DataflowInput{n4, 1}}}; + DataflowEdge{o2, DataflowInput{n4, 0_n}}, + DataflowEdge{o3, DataflowInput{n4, 1_n}}}; CHECK(result == correct); } SUBCASE("n3- single incoming edge") { std::vector result = get_incoming_edges(g, n3); std::vector correct = { - DataflowEdge{o2, DataflowInput{n3, 0}}, + DataflowEdge{o2, DataflowInput{n3, 0_n}}, }; CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc index be874b7e29..c37dcf5be7 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -10,26 +10,26 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_outgoing_edges(DataflowGraphView, Node)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + 
NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1}, 1); + NodeAddedResult n3_added = g.add_node({o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2}, 1); + NodeAddedResult n4_added = g.add_node({o2}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); SUBCASE("n2 - single outgoing edge") { std::unordered_set result = get_outgoing_edges(g, n2); std::unordered_set correct = { - DataflowEdge{o2, DataflowInput{n4, 0}}, + DataflowEdge{o2, DataflowInput{n4, 0_n}}, }; CHECK(result == correct); } @@ -37,8 +37,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("n1 - multiple outgoing edges") { std::unordered_set result = get_outgoing_edges(g, n1); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, }; CHECK(result == correct); } @@ -53,19 +53,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_outgoing_edges(DataflowGraphView, std::unordered_set)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1}, 1); + NodeAddedResult n3_added = g.add_node({o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2}, 1); + NodeAddedResult n4_added = g.add_node({o2}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -73,9 +73,9 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set nodes = {n1, n2}; std::unordered_set result = get_outgoing_edges(g, nodes); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, - DataflowEdge{o2, DataflowInput{n4, 0}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, + DataflowEdge{o2, DataflowInput{n4, 0_n}}, }; CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index 330628adfd..6c770a9d29 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -11,19 +11,19 @@ TEST_SUITE(FF_TEST_SUITE) { "std::unordered_set") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2, o1}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2, o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult 
n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -33,9 +33,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_subgraph_incoming_edges(g, input_node_set); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, - DataflowEdge{o1, DataflowInput{n3, 2}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 2_n}}, }; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc index 779d0a9560..bb7f3c4c30 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc @@ -11,19 +11,19 @@ TEST_SUITE(FF_TEST_SUITE) { "std::unordered_set") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -33,8 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { get_subgraph_outgoing_edges(g, input_node_set); std::unordered_set correct = { - DataflowEdge{o2, DataflowInput{n4, 1}}, - DataflowEdge{o3, DataflowInput{n4, 2}}, + DataflowEdge{o2, DataflowInput{n4, 1_n}}, + DataflowEdge{o3, DataflowInput{n4, 2_n}}, }; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc index c35789044d..4e26812315 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc @@ -19,19 +19,19 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); diff --git 
a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc index 1f8f66b932..38b722ec70 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc @@ -25,19 +25,19 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_leaf = [](Node const &n) { return BinarySPDecompositionTree{n}; }; SUBCASE("multiple nodes with edges across") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2, o1}, 1); + NodeAddedResult n3_added = g.add_node({o2, o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1}, 1); + NodeAddedResult n4_added = g.add_node({o1}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -54,15 +54,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ o1, - DataflowInput{n3, 1}, + DataflowInput{n3, 1_n}, }, DataflowEdge{ o2, - DataflowInput{n3, 0}, + DataflowInput{n3, 0_n}, }, DataflowEdge{ o1, - DataflowInput{n4, 0}, + DataflowInput{n4, 0_n}, }, }; @@ -70,12 +70,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("nodes each have multiple edges across") { - NodeAddedResult n1_added = g.add_node({}, 2); + NodeAddedResult n1_added = g.add_node({}, 2_n); Node n1 = n1_added.node; DataflowOutput n1_o1 = n1_added.outputs.at(0); DataflowOutput n1_o2 = n1_added.outputs.at(1); - NodeAddedResult n2_added = g.add_node({n1_o1, n1_o2, n1_o1}, 1); + NodeAddedResult n2_added = g.add_node({n1_o1, n1_o2, n1_o1}, 1_n); Node n2 = n2_added.node; TransitiveReducedDataflowGraphView tr_g = @@ -91,15 +91,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ n1_o1, - DataflowInput{n2, 0}, + DataflowInput{n2, 0_n}, }, DataflowEdge{ n1_o2, - DataflowInput{n2, 1}, + DataflowInput{n2, 1_n}, }, DataflowEdge{ n1_o1, - DataflowInput{n2, 2}, + DataflowInput{n2, 2_n}, }, }; @@ -107,19 +107,19 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("does not return edges eliminated by transitive reduction") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -136,7 +136,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ o2, - DataflowInput{n3, 1}, + 
DataflowInput{n3, 1_n}, }, }; diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc index 0e77739434..f922721fde 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc @@ -19,19 +19,19 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc b/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc index 7a3237d432..ec3ad86fe6 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { REQUIRE(result == correct); } - NodeAddedResult added = g.add_node({}, 2); + NodeAddedResult added = g.add_node({}, 2_n); { std::unordered_set result = g.query_nodes(node_query_all()); @@ -54,7 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { REQUIRE(result == correct); } - NodeAddedResult added2 = g.add_node(added.outputs, 3); + NodeAddedResult added2 = g.add_node(added.outputs, 3_n); { std::unordered_set result = g.query_nodes(node_query_all()); @@ -66,8 +66,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set result = g.query_edges(dataflow_edge_query_all()); std::unordered_set correct = { - DataflowEdge{added.outputs.at(0), DataflowInput{added2.node, 0}}, - DataflowEdge{added.outputs.at(1), DataflowInput{added2.node, 1}}, + DataflowEdge{added.outputs.at(0), DataflowInput{added2.node, 0_n}}, + DataflowEdge{added.outputs.at(1), DataflowInput{added2.node, 1_n}}, }; REQUIRE(result == correct); } diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc index 93d3d9605b..d9d91a03e9 100644 --- a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc +++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("add_edges(MultiDiGraph &, std::vector>)") { MultiDiGraph g = MultiDiGraph::create(); - std::vector n = add_nodes(g, 3); + std::vector n = add_nodes(g, 3_n); std::vector> input = { {n.at(0), n.at(1)}, diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc index e41bf33d6c..e3d9ee6a29 100644 --- 
a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc +++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("add_nodes(MultiDiGraph &, int)") { MultiDiGraph g = MultiDiGraph::create(); - std::unordered_set result = unordered_set_of(add_nodes(g, 3)); + std::unordered_set result = unordered_set_of(add_nodes(g, 3_n)); std::unordered_set correct = g.query_nodes(node_query_all()); CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc index aef6d9baff..0dfcc8a851 100644 --- a/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc +++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_edges(MultiDiGraphView)") { MultiDiGraph g = MultiDiGraph::create(); - std::vector n = add_nodes(g, 3); + std::vector n = add_nodes(g, 3_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc index 78aaa8d9fc..55b7b34e52 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc @@ -26,12 +26,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not empty") { DataflowGraphInput g1_i1 = g1.add_input(); - NodeAddedResult g1_n1_added = g1.add_node({OpenDataflowValue{g1_i1}}, 1); + NodeAddedResult g1_n1_added = + g1.add_node({OpenDataflowValue{g1_i1}}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); NodeAddedResult g1_n2_added = g1.add_node( - {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1); + {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1_n); Node g1_n2_node = g1_n2_added.node; SUBCASE("one graph is empty") { @@ -46,11 +47,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are isomorphic") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct = @@ -75,11 +76,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput g2_i1 = g2.add_input(); DataflowGraphInput g2_i2 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct = std::nullopt; @@ -93,12 +94,12 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different connectivity)") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + 
g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( {OpenDataflowValue{g2_n1_output}, OpenDataflowValue{g2_n1_output}}, - 1); + 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct = std::nullopt; @@ -112,14 +113,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different numbers of nodes)") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 0); + NodeAddedResult g2_n3_added = g2.add_node({}, 0_n); Node g2_n3_node = g2_n3_added.node; std::optional correct = std::nullopt; diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc index ff75e8fe48..fd54b801ce 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); DataflowGraphInput i1 = g.add_input(); - NodeAddedResult n0_added = g.add_node({}, 1); + NodeAddedResult n0_added = g.add_node({}, 1_n); std::unordered_set result = get_open_dataflow_graph_inputs(g); diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc index 7496c3009d..c7d294a588 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc @@ -18,19 +18,19 @@ TEST_SUITE(FF_TEST_SUITE) { NodeAddedResult n0_added = g.add_node( {OpenDataflowValue{i0}, OpenDataflowValue{i1}, OpenDataflowValue{i0}}, - 1); + 1_n); Node n0 = n0_added.node; DataflowOutput o0 = get_only(n0_added.outputs); NodeAddedResult n1_added = g.add_node( {OpenDataflowValue{i1}, OpenDataflowValue{o0}, OpenDataflowValue{i0}}, - 1); + 1_n); Node n1 = n1_added.node; std::unordered_set correct = { - DataflowInput{n0, 0}, - DataflowInput{n0, 2}, - DataflowInput{n1, 2}, + DataflowInput{n0, 0_n}, + DataflowInput{n0, 2_n}, + DataflowInput{n1, 2_n}, }; std::unordered_set result = @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 2); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 2_n); Node n0 = n0_added.node; DataflowOutput o0_0 = n0_added.outputs.at(0); DataflowOutput o0_1 = n0_added.outputs.at(1); @@ -53,16 +53,16 @@ TEST_SUITE(FF_TEST_SUITE) { NodeAddedResult n1_added = g.add_node({OpenDataflowValue{i0}, OpenDataflowValue{o0_1}, OpenDataflowValue{o0_0}}, - 1); + 1_n); Node n1 = n1_added.node; NodeAddedResult n2_added = - g.add_node({OpenDataflowValue{o0_1}, OpenDataflowValue{i0}}, 1); + 
g.add_node({OpenDataflowValue{o0_1}, OpenDataflowValue{i0}}, 1_n); Node n2 = n2_added.node; std::unordered_set correct = { - DataflowInput{n1, 1}, - DataflowInput{n2, 0}, + DataflowInput{n1, 1_n}, + DataflowInput{n2, 0_n}, }; std::unordered_set result = diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc index ddd6d74119..e1a2062865 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput g_i2 = g.add_input(); DataflowGraphInput g_i3 = g.add_input(); - NodeAddedResult g_n1_added = g.add_node({OpenDataflowValue{g_i2}}, 1); + NodeAddedResult g_n1_added = g.add_node({OpenDataflowValue{g_i2}}, 1_n); std::unordered_set result = get_unused_open_dataflow_graph_inputs(g); @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput g_i2 = g.add_input(); NodeAddedResult g_n1_added = - g.add_node({OpenDataflowValue{g_i1}, OpenDataflowValue{g_i2}}, 1); + g.add_node({OpenDataflowValue{g_i1}, OpenDataflowValue{g_i2}}, 1_n); std::unordered_set result = get_unused_open_dataflow_graph_inputs(g); diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc index bdb1bb4814..c53e069f68 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc @@ -21,12 +21,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not empty") { DataflowGraphInput g1_i1 = g1.add_input(); - NodeAddedResult g1_n1_added = g1.add_node({OpenDataflowValue{g1_i1}}, 1); + NodeAddedResult g1_n1_added = + g1.add_node({OpenDataflowValue{g1_i1}}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); NodeAddedResult g1_n2_added = g1.add_node( - {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1); + {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1_n); Node g1_n2_node = g1_n2_added.node; SUBCASE("one input graph is empty") { @@ -39,11 +40,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are isomorphic") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = true; @@ -57,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput g2_i1 = g2.add_input(); DataflowGraphInput g2_i2 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 
1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = false; @@ -73,12 +74,12 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different connectivity)") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( {OpenDataflowValue{g2_n1_output}, OpenDataflowValue{g2_n1_output}}, - 1); + 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = false; @@ -90,14 +91,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different numbers of nodes)") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 0); + NodeAddedResult g2_n3_added = g2.add_node({}, 0_n); Node g2_n3_node = g2_n3_added.node; bool correct = false; diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc index b565e46e67..90682cf0f0 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc @@ -17,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); DataflowGraphInput i1 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node n0 = n0_added.node; DataflowOutput n0_output = get_only(n0_added.outputs); - NodeAddedResult n1_added = g.add_node({OpenDataflowValue{n0_output}}, 1); + NodeAddedResult n1_added = g.add_node({OpenDataflowValue{n0_output}}, 1_n); Node n1 = n1_added.node; DataflowOutput n1_output = get_only(n1_added.outputs); @@ -44,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { new_i0, DataflowInput{ n0, - 0, + 0_n, }, }, }, @@ -52,11 +52,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowEdge{ DataflowOutput{ n0, - 0, + 0_n, }, DataflowInput{ n1, - 0, + 0_n, }, }, }, @@ -65,11 +65,11 @@ TEST_SUITE(FF_TEST_SUITE) { { DataflowOutput{ n0, - 0, + 0_n, }, DataflowOutput{ n1, - 0, + 0_n, }, }, }; diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc index 36bcd16dad..1e7ad87d88 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc @@ -17,12 +17,12 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node n0 = n0_added.node; DataflowOutput n0_output = get_only(n0_added.outputs); NodeAddedResult n1_added = - g.add_node({OpenDataflowValue{i0}, 
OpenDataflowValue{n0_output}}, 1); + g.add_node({OpenDataflowValue{i0}, OpenDataflowValue{n0_output}}, 1_n); Node n1 = n1_added.node; DataflowOutput n1_output = get_only(n1_added.outputs); @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { i0, DataflowInput{ new_node0, - 0, + 0_n, }, }, }, @@ -54,7 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { i0, DataflowInput{ new_node1, - 0, + 0_n, }, }, }, @@ -62,11 +62,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowEdge{ DataflowOutput{ new_node0, - 0, + 0_n, }, DataflowInput{ new_node1, - 1, + 1_n, }, }, }, @@ -75,11 +75,11 @@ TEST_SUITE(FF_TEST_SUITE) { { DataflowOutput{ new_node0, - 0, + 0_n, }, DataflowOutput{ new_node1, - 0, + 0_n, }, }, }; @@ -109,9 +109,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("check access to old edges") { OpenDataflowEdgeQuery query = OpenDataflowEdgeQuery{ dataflow_input_edge_query_for_edge( - DataflowInputEdge{i0, DataflowInput{n0, 0}}), + DataflowInputEdge{i0, DataflowInput{n0, 0_n}}), dataflow_edge_query_for_edge( - DataflowEdge{n0_output, DataflowInput{n1, 1}}), + DataflowEdge{n0_output, DataflowInput{n1, 1_n}}), }; std::unordered_set result_nodes = result.query_edges(query); @@ -121,12 +121,12 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("check access to new edges") { DataflowEdge new_standard_edge = DataflowEdge{ - DataflowOutput{new_node0, 0}, - DataflowInput{new_node1, 1}, + DataflowOutput{new_node0, 0_n}, + DataflowInput{new_node1, 1_n}, }; DataflowInputEdge new_input_edge = DataflowInputEdge{ i0, - DataflowInput{new_node0, 0}, + DataflowInput{new_node0, 0_n}, }; OpenDataflowEdgeQuery query = OpenDataflowEdgeQuery{ dataflow_input_edge_query_for_edge(new_input_edge), @@ -159,7 +159,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("check access to new outputs") { - DataflowOutput new_output = DataflowOutput{new_node0, 0}; + DataflowOutput new_output = DataflowOutput{new_node0, 0_n}; DataflowOutputQuery query = dataflow_output_query_for_output(new_output); diff --git a/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc b/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc index a62f528bcf..a2f818b5e9 100644 --- a/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc +++ b/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_parallel_reduction") { MultiDiGraph g = MultiDiGraph::create(); SUBCASE("base case") { - std::vector n = add_nodes(g, 2); + std::vector n = add_nodes(g, 2_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("does not apply when there is only one edge") { - std::vector n = add_nodes(g, 2); + std::vector n = add_nodes(g, 2_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, @@ -40,7 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("requires both ends be the same") { - std::vector n = add_nodes(g, 3); + std::vector n = add_nodes(g, 3_n); SUBCASE("branch out") { std::vector e = add_edges(g, { @@ -67,7 +67,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("finds one reduction when there are multiple") { - std::vector n = add_nodes(g, 2); + std::vector n = add_nodes(g, 2_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, @@ -86,7 +86,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("in larger graph") { - std::vector n = add_nodes(g, 5); + std::vector n = add_nodes(g, 5_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, @@ -109,7 +109,7 @@ TEST_SUITE(FF_TEST_SUITE) { MultiDiGraph g = MultiDiGraph::create(); SUBCASE("base case") { - std::vector n = 
add_nodes(g, 2); + std::vector<Node> n = add_nodes(g, 2_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -142,7 +142,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("in larger graph") { - std::vector<Node> n = add_nodes(g, 5); + std::vector<Node> n = add_nodes(g, 5_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, diff --git a/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc b/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc index c6b45ec6ce..4bb57aeb0d 100644 --- a/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc +++ b/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_pre/post/center_node") { MultiDiGraph g = MultiDiGraph::create<AdjacencyMultiDiGraph>(); - std::vector<Node> n = add_nodes(g, 3); + std::vector<Node> n = add_nodes(g, 3_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_series_reduction") { MultiDiGraph g = MultiDiGraph::create<AdjacencyMultiDiGraph>(); SUBCASE("base case") { - std::vector<Node> n = add_nodes(g, 3); + std::vector<Node> n = add_nodes(g, 3_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -57,7 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not find if other edges are involved with center node") { SUBCASE("duplicate edge") { - std::vector<Node> n = add_nodes(g, 3); + std::vector<Node> n = add_nodes(g, 3_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -71,7 +71,7 @@ } SUBCASE("misc edge") { - std::vector<Node> n = add_nodes(g, 4); + std::vector<Node> n = add_nodes(g, 4_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -86,7 +86,7 @@ } SUBCASE("does find if other edges are involved with non-center node") { - std::vector<Node> n = add_nodes(g, 4); + std::vector<Node> n = add_nodes(g, 4_n); SUBCASE("edge from dst") { std::vector<MultiDiEdge> e = add_edges(g, { @@ -107,7 +107,7 @@ } SUBCASE("finds one reduction when there are multiple") { - std::vector<Node> n = add_nodes(g, 4); + std::vector<Node> n = add_nodes(g, 4_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -125,7 +125,7 @@ } SUBCASE("in larger graph") { - std::vector<Node> n = add_nodes(g, 8); + std::vector<Node> n = add_nodes(g, 8_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(2)}, @@ -149,7 +149,7 @@ MultiDiGraph g = MultiDiGraph::create<AdjacencyMultiDiGraph>(); SUBCASE("base case") { - std::vector<Node> n = add_nodes(g, 3); + std::vector<Node> n = add_nodes(g, 3_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -188,7 +188,7 @@ } SUBCASE("in larger graph") { - std::vector<Node> n = add_nodes(g, 8); + std::vector<Node> n = add_nodes(g, 8_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(2)}, diff --git a/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc new file mode 100644 index 0000000000..7ac882ff9f --- /dev/null +++ b/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc @@ -0,0 +1,52 @@ +#include "utils/nonnegative_int/ceildiv.h" +#include <doctest/doctest.h> + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ceildiv(nonnegative_int, nonnegative_int)") { + SUBCASE("divides evenly") { + nonnegative_int numerator = 12_n; + nonnegative_int denominator = 3_n; + + nonnegative_int result = ceildiv(numerator, denominator); + nonnegative_int correct = 4_n; + + CHECK(result == correct); + } + + SUBCASE("does not divide evenly") { + nonnegative_int numerator = 17_n; + nonnegative_int denominator = 4_n; + + nonnegative_int result = ceildiv(numerator, denominator); + nonnegative_int correct = 5_n; + + CHECK(result == correct); + } + + SUBCASE("denominator is zero") { + nonnegative_int numerator = 15_n; + nonnegative_int denominator = 0_n; + + CHECK_THROWS(ceildiv(numerator, denominator)); + } + + SUBCASE("numerator is zero") { + nonnegative_int numerator = 0_n; + nonnegative_int denominator = 1_n; + + nonnegative_int result = ceildiv(numerator, denominator); + nonnegative_int correct = 0_n; + + CHECK(result == correct); + } + + SUBCASE("denominator and numerator are zero") { + nonnegative_int numerator = 0_n; + nonnegative_int denominator = 0_n; + + CHECK_THROWS(ceildiv(numerator, denominator)); + } + } +}
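The new ceildiv test file above pins down the helper's contract: round the quotient upward, define a zero numerator as yielding zero, and throw on a zero denominator. For reference, a minimal sketch of that contract using plain ints rather than the project's nonnegative_int wrapper (the function name and error text here are illustrative, not taken from the patch):

    #include <stdexcept>

    // Ceiling division via the usual (num + denom - 1) / denom identity,
    // valid for the nonnegative operands the tests above exercise:
    // 12/3 -> 4, 17/4 -> 5, 0/1 -> 0.
    int ceildiv_ref(int num, int denom) {
      if (denom == 0) {
        throw std::runtime_error("ceildiv: zero denominator");
      }
      return (num + denom - 1) / denom;
    }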
diff --git a/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc index 73d382d830..dfde11f9bd 100644 --- a/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc +++ b/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc @@ -198,13 +198,89 @@ TEST_SUITE(FF_TEST_SUITE) { } } - TEST_CASE("nonnegative_int + operation") { - nonnegative_int nn_int_1a = nonnegative_int{1}; - nonnegative_int nn_int_1b = nonnegative_int{1}; - nonnegative_int nn_int_2 = nonnegative_int{2}; - SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int") { - CHECK(nn_int_1a + nn_int_1b == nn_int_2); - } + TEST_CASE("nonnegative_int::operator+(nonnegative_int)") { + nonnegative_int result = nonnegative_int{1} + nonnegative_int{2}; + nonnegative_int correct = nonnegative_int{3}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator++() (pre-increment)") { + nonnegative_int input = nonnegative_int{1}; + + nonnegative_int result = ++input; + nonnegative_int correct = nonnegative_int{2}; + + CHECK(result == correct); + CHECK(input == correct); + } + + TEST_CASE("nonnegative_int::operator++(int) (post-increment)") { + nonnegative_int input = nonnegative_int{1}; + + nonnegative_int result = input++; + nonnegative_int correct_input = nonnegative_int{2}; + nonnegative_int correct_result = nonnegative_int{1}; + + CHECK(result == correct_result); + CHECK(input == correct_input); + } + + TEST_CASE("nonnegative_int::operator+=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{1}; + result += nonnegative_int{3}; + + nonnegative_int correct = nonnegative_int{4}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator*(nonnegative_int)") { + nonnegative_int result = nonnegative_int{2} * nonnegative_int{3}; + nonnegative_int correct = nonnegative_int{6}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator*=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{3}; + result *= nonnegative_int{6}; + + nonnegative_int correct = nonnegative_int{18}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator/(nonnegative_int)") { + nonnegative_int result = nonnegative_int{5} / nonnegative_int{2}; + nonnegative_int correct = nonnegative_int{2}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator/=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{13}; + result /= nonnegative_int{3}; + + nonnegative_int correct = nonnegative_int{4}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator%(nonnegative_int)") { + nonnegative_int result = nonnegative_int{5} % nonnegative_int{2}; + nonnegative_int correct = nonnegative_int{1}; + + CHECK(result == correct); + } + + 
TEST_CASE("nonnegative_int::operator%=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{15}; + result %= nonnegative_int{4}; + + nonnegative_int correct = nonnegative_int{3}; + + CHECK(result == correct); } TEST_CASE("adl_serializer") { diff --git a/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc b/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc new file mode 100644 index 0000000000..db8fca295e --- /dev/null +++ b/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc @@ -0,0 +1,42 @@ +#include "utils/nonnegative_int/nonnegative_range.h" +#include "test/utils/doctest/fmt/vector.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("nonnegative_range(nonnegative_int)") { + SUBCASE("bound is greater than zero") { + std::vector result = + nonnegative_range(nonnegative_int{3}); + std::vector correct = { + nonnegative_int{0}, + nonnegative_int{1}, + nonnegative_int{2}, + }; + + CHECK(result == correct); + } + + SUBCASE("bound is zero") { + std::vector result = + nonnegative_range(nonnegative_int{0}); + std::vector correct = {}; + + CHECK(result == correct); + } + } + + TEST_CASE("nonnegative_range(nonnegative_int, nonnegative_int, int)") { + std::vector result = nonnegative_range( + /*start=*/nonnegative_int{7}, + /*end=*/nonnegative_int{3}, + /*step=*/-2); + std::vector correct = { + nonnegative_int{7}, + nonnegative_int{5}, + }; + + CHECK(result == correct); + } +} diff --git a/lib/utils/test/src/utils/nonnegative_int/num_elements.cc b/lib/utils/test/src/utils/nonnegative_int/num_elements.cc new file mode 100644 index 0000000000..0878be0410 --- /dev/null +++ b/lib/utils/test/src/utils/nonnegative_int/num_elements.cc @@ -0,0 +1,15 @@ +#include "utils/nonnegative_int/num_elements.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("num_elements") { + std::vector input = {-1, 3, 3, 1}; + + nonnegative_int result = num_elements(input); + nonnegative_int correct = nonnegative_int{4}; + + CHECK(result == correct); + } +} diff --git a/lib/utils/test/src/utils/random_utils.cc b/lib/utils/test/src/utils/random_utils.cc index 8e7d22138f..fdc48a64dd 100644 --- a/lib/utils/test/src/utils/random_utils.cc +++ b/lib/utils/test/src/utils/random_utils.cc @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("correct distribution") { auto check_probabilities = [](std::vector const &values, std::vector const &weights) { - int num_iterations = 10'000; + nonnegative_int num_iterations = 10'000_n; std::vector trials = repeat( num_iterations, [&]() { return select_random(values, weights); }); @@ -39,8 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { float expectedProbability = w / sum(weights); int num_occurrences = filter(trials, [&](int c) { return (c == v); }).size(); - float observedProbability = - static_cast(num_occurrences) / num_iterations; + float observedProbability = static_cast(num_occurrences) / + num_iterations.unwrap_nonnegative(); CHECK(observedProbability == doctest::Approx(expectedProbability).epsilon(0.01f)); } From fe339ebc140319f97049a17a81e3380269f69188 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 11 Jul 2024 14:33:42 -0700 Subject: [PATCH 27/42] test_utils refactor, local_cpu_allocator --- lib/kernels/test/src/test_replicate_kernel.cc | 2 +- lib/kernels/test/src/test_utils.cc | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 5133c4c89c..357d1958c0 100644 
--- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -48,7 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { - nonnegative_int num_replicas = 10_n; + nonnegative_int num_replicas = 2_n; TensorShape input_shape = make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index a15447446a..70cca5f2f0 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -99,7 +99,8 @@ struct CPUAccessorRContainsNonZero { T const *data_ptr = accessor.get<DT>(); - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + int volume = accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { if (data_ptr[i] != 0) { return true; } @@ -178,7 +179,8 @@ struct AccessorsAreEqual { T const *a_data_ptr = cpu_accessor_a.get<DT>(); T const *b_data_ptr = cpu_accessor_b.get<DT>(); - for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { + int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { if (a_data_ptr[i] != b_data_ptr[i]) { return false; } @@ -218,7 +220,9 @@ struct CreateFilledAccessorW { GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); T *data_ptr = src_accessor.get<DT>(); - for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { + + int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { data_ptr[i] = unwrapped_value; } 
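An aside on the test_utils hunks above: each loop now reads num_elements() once, unwraps the nonnegative_int into a plain int, and iterates against that bound. One wrinkle the patch keeps is a size_t loop index compared against the int bound, which mixes signedness; a sketch of the fully unwrapped form (illustrative, not part of the patch):

    // Hoist the element count out of the loop and unwrap it once; using an
    // int index as well keeps the comparison within a single integer type.
    int volume = accessor.shape.num_elements().unwrap_nonnegative();
    for (int i = 0; i < volume; i++) {
      // ... element-wise work on data_ptr[i] ...
    }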
From 2e2ae131b3d2fa1a11278e5e3482ceedd47f780c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Fri, 12 Jul 2024 12:54:48 -0700 Subject: [PATCH 28/42] test utils modification, cast, reverse, and replicate cpu kernels --- lib/kernels/src/cuda/ops/reverse_kernels.cu | 36 +++++- lib/kernels/test/src/test_cast_kernel.cc | 56 ++++++++++ lib/kernels/test/src/test_replicate_kernel.cc | 86 ++++++++++++++ lib/kernels/test/src/test_reverse_kernels.cc | 105 ++++++++++++++++++ 4 files changed, 277 insertions(+), 6 deletions(-) diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 2c25293c36..c750819266 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -20,6 +20,29 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { +// __global__ void reverse_forward_kernel(float const *in_ptr, +// float *out_ptr, +// coord_t num_out_blks, +// coord_t reverse_dim_size, +// coord_t in_blk_size) { +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = in_ptr[in_idx]; +// } +// } + +/* I mentioned this earlier, but I still think the reverse_forward_kernel code is incorrect, even though it matches the code in inference/master. Whenever I test the code and print the output, I get unexpected values, and I believe the cause is that the commented-out code above modifies the loop index i inside the loop body. 
+*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, coord_t reverse_dim_size, @@ -27,12 +50,13 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - i = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = i / in_blk_size; - i = i - reverse_dim_idx * in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = idx_within_blk / in_blk_size; + coord_t in_idx = idx_within_blk % in_blk_size; + coord_t input_index = + blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; + out_ptr[i] = in_ptr[input_index]; } }
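The rewritten kernel above derives every position from an unmodified i. For intuition, a self-contained host-side reference of the same mapping (a sketch, not part of the patch; CUDA_KERNEL_LOOP advances i by the grid stride after each iteration, which is exactly why the old kernel's in-body mutation of i corrupted both the output index and the loop's progression):

    #include <cstddef>
    #include <vector>

    // Output element i lies in block blk_idx; within that block it sits at
    // position reverse_dim_idx along the reversed dimension, at offset in_idx
    // inside the innermost block. Its source is the mirrored position along
    // the reversed dimension.
    std::vector<float> reverse_forward_reference(std::vector<float> const &in,
                                                 std::size_t num_out_blks,
                                                 std::size_t reverse_dim_size,
                                                 std::size_t in_blk_size) {
      std::vector<float> out(in.size());
      for (std::size_t i = 0; i < num_out_blks * reverse_dim_size * in_blk_size;
           i++) {
        std::size_t blk_idx = i / (reverse_dim_size * in_blk_size);
        std::size_t idx_within_blk = i % (reverse_dim_size * in_blk_size);
        std::size_t reverse_dim_idx = idx_within_blk / in_blk_size;
        std::size_t in_idx = idx_within_blk % in_blk_size;
        out.at(i) =
            in.at(blk_idx * (reverse_dim_size * in_blk_size) +
                  (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size +
                  in_idx);
      }
      return out;
    }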
diff --git a/lib/kernels/test/src/test_cast_kernel.cc index 2ac27a9747..10e3ef791b 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -75,4 +75,60 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100, 100}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100, 100}); + + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + auto transform = [start_val = 1.1f, + counter = 0.0f](float input) mutable -> float { + return start_val + counter++; + }; + + // Run GPU Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), + read_only_accessor_from_write_accessor(input_accessor_gpu), + output_accessor_gpu, + DataType::FLOAT, + DataType::INT32); + std::vector<int32_t> result_data_gpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), + true); + + // Run CPU Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + Kernels::Cast::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu, + DataType::FLOAT, + DataType::INT32); + std::vector<int32_t> result_data_cpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc index 357d1958c0..8bb6086543 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -110,4 +110,90 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } + + TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { + std::size_t num_replicas = 10; + + // This should be like three shapes: pre_replication, replication shape, and + // reduced shape, but things are weird cause doesn't seem to be replicating + // anything + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape replicated_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape reduced_shape = + make_tensor_shape_from_legion_dims({10}); + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(input_shape, gpu_allocator)); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + std::vector<float> result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Replicate Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + copy_tensor_between_memories( + input_accessor_gpu, input_shape, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu); + + std::vector<float> result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + GenericTensorAccessorR output_grad_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + input_grad_accessor_gpu, + output_grad_accessor_gpu, + num_replicas); + + std::vector<float> result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + output_grad_accessor_gpu, replicated_shape, cpu_allocator); + + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::CPU::backward_kernel( + input_grad_accessor_cpu, + read_only_accessor_from_write_accessor(output_grad_accessor_cpu), + num_replicas); + + std::vector<float> result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + }
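The comment at the top of the replicate test above flags uncertainty about the shapes involved; for reference, the semantics the GPU-vs-CPU comparison is checking is copy-on-forward, sum-on-backward. A standalone sketch, assuming replicas are laid out as contiguous blocks (this is our illustration, not the project's accessor API):

    #include <cstddef>
    #include <vector>

    // Forward: the output is num_replicas back-to-back copies of the input.
    std::vector<float> replicate_forward_ref(std::vector<float> const &in,
                                             std::size_t num_replicas) {
      std::vector<float> out;
      for (std::size_t r = 0; r < num_replicas; r++) {
        out.insert(out.end(), in.begin(), in.end());
      }
      return out;
    }

    // Backward: the input gradient is the elementwise sum over the replicas'
    // output gradients.
    std::vector<float> replicate_backward_ref(std::vector<float> const &out_grad,
                                              std::size_t num_replicas) {
      std::size_t n = out_grad.size() / num_replicas;
      std::vector<float> in_grad(n, 0.0f);
      for (std::size_t r = 0; r < num_replicas; r++) {
        for (std::size_t i = 0; i < n; i++) {
          in_grad[i] += out_grad[r * n + i];
        }
      }
      return in_grad;
    }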
diff --git a/lib/kernels/test/src/test_reverse_kernels.cc index bf23188a8f..b865792f3f 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -137,4 +137,109 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 2; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 5; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + std::vector<float> result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::CPU::forward_kernel( + input_accessor_cpu.get_float_ptr(), + output_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_cpu.shape.num_elements()); + + std::vector<float> result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + // Run GPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_gpu = + create_random_filled_accessor_w(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + std::vector<float> result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + // Run CPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + read_only_accessor_from_write_accessor(output_grad_accessor_gpu), + output_shape, + cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::CPU::backward_kernel( + output_grad_accessor_cpu.get_float_ptr(), + input_grad_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_cpu.shape.num_elements()); + + std::vector<float> result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } From 6c30466fc3980e6c5d169ec35a89f62720ed61e2 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 14 Jul 2024 15:45:59 -0700 Subject: [PATCH 29/42] combine kernel --- lib/kernels/src/local_cuda_allocator.cc | 1 + lib/kernels/test/src/test_cast_kernel.cc | 4 ++-- lib/kernels/test/src/test_replicate_kernel.cc | 13 +++++++------ lib/kernels/test/src/test_reverse_kernels.cc | 8 ++++---- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/lib/kernels/src/local_cuda_allocator.cc 
b/lib/kernels/src/local_cuda_allocator.cc index 416768a479..c72020acb2 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -6,6 +6,7 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); this->ptrs.insert(ptr); return ptr; } diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 10e3ef791b..77d602a89d 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -112,7 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_gpu), - true); + false); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 8bb6086543..fc61458568 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -116,7 +116,7 @@ TEST_SUITE(FF_TEST_SUITE) { // This should be like three shapes: pre_replication, replication shape, and // reduced shape, but things are weird cause doesn't seem to be replicating - // anything + // anything (ie. input shape should be same as reduced shape) TensorShape input_shape = make_tensor_shape_from_legion_dims({10, num_replicas}); TensorShape replicated_shape = @@ -142,7 +142,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -156,12 +156,13 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor_cpu); std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = read_only_accessor_from_write_accessor( create_random_filled_accessor_w(replicated_shape, gpu_allocator)); @@ -175,12 +176,12 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); + // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( output_grad_accessor_gpu, replicated_shape, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(reduced_shape); @@ -191,7 +192,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index b865792f3f..d1c5274dc8 100644 --- 
a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -175,7 +175,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_gpu.shape.num_elements()); std::vector<float> result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -193,7 +193,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_cpu.shape.num_elements()); std::vector<float> result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } @@ -216,7 +216,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector<float> result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = @@ -237,7 +237,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector<float> result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); }
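Why the cudaMemset that this patch adds to LocalCudaAllocator::allocate matters: several backward kernels in these tests accumulate into gradient buffers rather than overwriting them, so a freshly cudaMalloc'd buffer starts as garbage and makes the GPU-vs-CPU comparison nondeterministic. A minimal sketch of the failure mode (illustrative only; n is an assumed element count):

    // With plain cudaMalloc, in_grad holds garbage, so an accumulating
    // backward pass computes garbage + gradient; zeroing first makes the
    // += accumulation well-defined.
    float *in_grad;
    checkCUDA(cudaMalloc(&in_grad, n * sizeof(float)));
    checkCUDA(cudaMemset(in_grad, 0, n * sizeof(float)));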
From 5b5c591ab1b7d209775485ee69af3239bd769fa8 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Wed, 31 Jul 2024 04:49:13 -0700 Subject: [PATCH 30/42] test utils logic cleanup, reverse cpu_kernel pedagogical implementation, other minor fixes --- lib/kernels/include/kernels/allocation.h | 6 +++ .../include/kernels/local_cpu_allocator.h | 1 + .../include/kernels/local_cuda_allocator.h | 1 + lib/kernels/src/allocation.cc | 4 ++ lib/kernels/src/local_cpu_allocator.cc | 3 +- lib/kernels/src/local_cuda_allocator.cc | 7 +++ lib/kernels/test/src/test_attention_kernel.cc | 26 +++++++---- .../test/src/test_batch_matmul_kernel.cc | 12 +++-- .../test/src/test_batch_norm_kernel.cc | 15 ++++-- lib/kernels/test/src/test_cast_kernel.cc | 25 +++++----- lib/kernels/test/src/test_dropout.cc | 6 ++- lib/kernels/test/src/test_gather_kernels.cc | 3 +- .../test/src/test_layer_norm_kernels.cc | 3 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 ++- lib/kernels/test/src/test_replicate_kernel.cc | 46 +++++++++---------- lib/kernels/test/src/test_reverse_kernels.cc | 38 ++++++++------- lib/kernels/test/src/test_softmax_kernel.cc | 6 ++- lib/kernels/test/src/test_split_kernel.cc | 6 ++- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- .../local-execution/tracked_allocator.h | 1 + lib/local-execution/src/tracked_allocator.cc | 6 +++ 21 files changed, 138 insertions(+), 86 deletions(-) diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 4bf97118ce..7a8b844cf4 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,10 +5,13 @@ #include #include +enum class AllocLocation { HOST, DEVICE }; + namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; + virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; virtual DeviceType get_allocation_device_type() const = 0; @@ -22,6 +25,7 @@ struct Allocator { GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); void *allocate(size_t mem_size); + void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); DeviceType get_allocation_device_type() const; @@ -35,6 +39,8 @@ Allocator(std::shared_ptr<IAllocator> ptr) : i_allocator(ptr){}; + + AllocLocation alloc_location; + private: std::shared_ptr<IAllocator> i_allocator; }; diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h index cf6cfe35d1..c18d43683e 100644 --- a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -10,6 +10,7 @@ struct LocalCPUAllocator : public IAllocator { ~LocalCPUAllocator() = default; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index b8e0540974..fb3a42d864 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,6 +10,7 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index bed8daba51..255cf4b7e3 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -7,6 +7,10 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } +void *Allocator::allocate_and_zero(size_t mem_size) { + return this->i_allocator->allocate_and_zero(mem_size); +} + void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 5cf337c685..adc31b2c6b 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -23,8 +23,7 @@ DeviceType LocalCPUAllocator::get_allocation_device_type() const { } Allocator create_local_cpu_memory_allocator() { - Allocator allocator = Allocator::create<LocalCPUAllocator>(); - return allocator; + return Allocator::create<LocalCPUAllocator>(); } } // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index c72020acb2..666e5cae2e 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -4,6 +4,13 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + this->ptrs.insert(ptr); + return ptr; +} + +void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index bd0167a677..ea861c7da9 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -53,13 +53,16 @@ TEST_SUITE(FF_TEST_SUITE) { {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w<DataType::FLOAT>(query_shape, + allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w<DataType::FLOAT>(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w<DataType::FLOAT>(value_shape, + allocator); 
GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -79,15 +82,20 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, + allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index d78d5daee5..63e0909b9a 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -29,11 +29,14 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, allocator); + create_random_filled_accessor_w(input_shape_a, + allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, allocator); + create_random_filled_accessor_w(input_shape_b, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -52,7 +55,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index d0ec2559ba..79331a8539 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -39,7 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w( @@ -62,13 +63,17 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); 
+ create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, allocator); + create_random_filled_accessor_w(scale_shape, + allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, allocator); + create_random_filled_accessor_w(bias_shape, + allocator); Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 77d602a89d..af7f537189 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -83,9 +83,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -102,31 +102,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before GPU load" << std::endl; std::vector result_data_gpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), - false); + load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); - Kernels::Cast::CPU::forward_kernel( + input_shape, cpu_allocator, transform); + Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before CPU load" << std::endl; + if (output_accessor_cpu.on_device) { + std::cout << "CPU data is on device" << std::endl; + } else { + std::cout << "CPU data is on host" << std::endl; + } std::vector result_data_cpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), - true); + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index ad74fa7d36..4bcb37f083 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -47,9 +47,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index b75614588c..45005092fe 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -42,7 +42,8 @@ 
TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 8368fe4efd..cebf88986d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -56,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ff74f6fb28..74d178bd64 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -55,9 +55,11 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fc61458568..9cd59464b9 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -118,11 +118,11 @@ TEST_SUITE(FF_TEST_SUITE) { // reduced shape, but things are weird cause doesn't seem to be replicating // anything (ie. 
input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}); + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -133,30 +133,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { // Run GPU Replicate Forward Kernel GenericTensorAccessorR input_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, gpu_allocator)); + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(replicated_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories( - input_accessor_gpu, input_shape, cpu_allocator); + copy_tensor_between_memories(input_accessor_gpu, + cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(replicated_shape); - Kernels::Replicate::CPU::forward_kernel( + Kernels::Replicate::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -164,35 +164,33 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + create_random_filled_accessor_r(replicated_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(reduced_shape); + gpu_allocator.allocate_tensor_and_zero(reduced_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, output_grad_accessor_gpu, num_replicas); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( - output_grad_accessor_gpu, replicated_shape, cpu_allocator); + output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(reduced_shape); + cpu_allocator.allocate_tensor_and_zero(reduced_shape); - Kernels::Replicate::CPU::backward_kernel( + Kernels::Replicate::cpu_backward_kernel( input_grad_accessor_cpu, read_only_accessor_from_write_accessor(output_grad_accessor_cpu), num_replicas); - std::vector result_data_cpu = load_accessor_data( - 
read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index d1c5274dc8..503da33984 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -43,7 +43,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -143,9 +144,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t reverse_dim_size = 3; std::size_t in_blk_size = 5; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -162,7 +162,7 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Cast Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -174,17 +174,17 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); + input_shape, cpu_allocator, transform); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); - Kernels::Reverse::CPU::forward_kernel( + Kernels::Reverse::cpu_forward_kernel( input_accessor_cpu.get_float_ptr(), output_accessor_cpu.get_float_ptr(), num_out_blks, @@ -192,8 +192,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -201,7 +201,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_gpu = - create_random_filled_accessor_w(output_shape, gpu_allocator); + create_random_filled_accessor_w(output_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = gpu_allocator.allocate_tensor(input_shape); @@ -214,20 +215,18 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( read_only_accessor_from_write_accessor(output_grad_accessor_gpu), - output_shape, 
cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(input_shape); - Kernels::Reverse::CPU::backward_kernel( + Kernels::Reverse::cpu_backward_kernel( output_grad_accessor_cpu.get_float_ptr(), input_grad_accessor_cpu.get_float_ptr(), num_out_blks, @@ -235,9 +234,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index d4fb496f7b..7e6e95daaf 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -32,11 +32,13 @@ TEST_SUITE(FF_TEST_SUITE) { input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index d98f88a30e..4d3b948714 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -27,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -48,7 +49,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index cac43c6ff3..c35961b739 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -39,7 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index f697337c52..31ca2475e2 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -12,6 +12,7 @@ struct TrackedAllocator : public IAllocator { ~TrackedAllocator() = default; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git 
a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index ed181aea32..7bce6ef304 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,6 +12,12 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } +void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { + void *ptr = this->allocator.allocate_and_zero(requested_memory_size); + this->current_mem_usage += requested_memory_size; + return ptr; +} + void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); From f0432c393d972bf262fe7153686a10b10cd279e2 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:18:45 -0700 Subject: [PATCH 31/42] cpu_kernel's refactor, generic tensor accessor indexing --- lib/kernels/include/kernels/allocation.h | 6 --- .../include/kernels/batch_norm_kernels.h | 8 +-- .../include/kernels/local_cpu_allocator.h | 1 - .../include/kernels/local_cuda_allocator.h | 1 - lib/kernels/src/allocation.cc | 4 -- lib/kernels/src/local_cuda_allocator.cc | 8 --- lib/kernels/test/src/test_cast_kernel.cc | 24 +++------ lib/kernels/test/src/test_replicate_kernel.cc | 51 ++++++++---------- lib/kernels/test/src/test_reverse_kernels.cc | 53 +++++++++---------- .../local-execution/tracked_allocator.h | 1 - lib/local-execution/src/tracked_allocator.cc | 6 --- 11 files changed, 56 insertions(+), 107 deletions(-) diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 7a8b844cf4..4bf97118ce 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,13 +5,10 @@ #include #include -enum class AllocLocation { HOST, DEVICE }; - namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; - virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; virtual DeviceType get_allocation_device_type() const = 0; @@ -25,7 +22,6 @@ struct Allocator { GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); void *allocate(size_t mem_size); - void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); DeviceType get_allocation_device_type() const; @@ -39,8 +35,6 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; - AllocLocation alloc_location; - private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 4b89eb1411..26f347dd4c 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -7,8 +7,7 @@ #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels::BatchNorm { +namespace ::FlexFlow::Kernels::BatchNorm; BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -28,8 +27,6 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, float *output_grad_ptr, float const *input_ptr, @@ -47,7 +44,4 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace Kernels::BatchNorm -} // namespace FlexFlow - #endif diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h index c18d43683e..cf6cfe35d1 100644 --- 
a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -10,7 +10,6 @@ struct LocalCPUAllocator : public IAllocator { ~LocalCPUAllocator() = default; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index fb3a42d864..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,7 +10,6 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 255cf4b7e3..bed8daba51 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -7,10 +7,6 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } -void *Allocator::allocate_and_zero(size_t mem_size) { - return this->i_allocator->allocate_and_zero(mem_size); -} - void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index 666e5cae2e..416768a479 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -10,14 +10,6 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { return ptr; } -void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr; - checkCUDA(cudaMalloc(&ptr, requested_memory_size)); - checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); - this->ptrs.insert(ptr); - return ptr; -} - void LocalCudaAllocator::deallocate(void *ptr) { if (contains(this->ptrs, ptr)) { checkCUDA(cudaFree(ptr)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index af7f537189..1afa126870 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -94,44 +94,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Only calling forward kernel as backward kernel is exactly the same SUBCASE("forward_kernel") { - auto transform = [start_val = 1.1f, - counter = 0.0f](float input) mutable -> float { - return start_val + counter++; - }; - // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = - create_transformed_accessor_w( - input_shape, gpu_allocator, transform); + create_random_filled_accessor_w(input_shape, + gpu_allocator); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); - std::cout << "Before GPU load" << std::endl; + std::vector result_data_gpu = load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = - create_transformed_accessor_w( - input_shape, cpu_allocator, transform); + create_random_filled_accessor_w(input_shape, + cpu_allocator); Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); - std::cout << "Before CPU load" << std::endl; - if (output_accessor_cpu.on_device) { - std::cout << "CPU data is on 
device" << std::endl; - } else { - std::cout << "CPU data is on host" << std::endl; - } + std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 9cd59464b9..902a5a7427 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -112,17 +112,12 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { - std::size_t num_replicas = 10; + std::size_t num_replicas = 2; - // This should be like three shapes: pre_replication, replication shape, and - // reduced shape, but things are weird cause doesn't seem to be replicating - // anything (ie. input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); - TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); - TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -136,7 +131,8 @@ TEST_SUITE(FF_TEST_SUITE) { create_random_filled_accessor_r(input_shape, gpu_allocator); GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(replicated_shape); + gpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_gpu); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); @@ -145,29 +141,29 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories(input_accessor_gpu, - cpu_allocator); + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(replicated_shape); + cpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_cpu); - Kernels::Replicate::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu); + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(replicated_shape, + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor_and_zero(reduced_shape); + gpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_gpu); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, @@ -178,21 +174,20 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel - GenericTensorAccessorW output_grad_accessor_cpu = - copy_tensor_between_memories( - 
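
The comparisons above now go through vectors_are_approx_equal rather than operator==, which is the right call when CPU and GPU kernels can legitimately differ in the last few float ulps. A minimal sketch of such a helper, assuming a combined absolute/relative tolerance (the real helper's signature and epsilon may differ):

#include <cmath>
#include <cstddef>
#include <vector>

bool approx_equal_sketch(std::vector<float> const &a,
                         std::vector<float> const &b,
                         float eps = 1e-5f) {
  if (a.size() != b.size()) {
    return false;
  }
  for (std::size_t i = 0; i < a.size(); i++) {
    // absolute tolerance near zero, relative tolerance for large magnitudes
    float scale = 1.0f + std::fmax(std::fabs(a[i]), std::fabs(b[i]));
    if (std::fabs(a[i] - b[i]) > eps * scale) {
      return false;
    }
  }
  return true;
}
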
output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor_and_zero(reduced_shape); + cpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_cpu); Kernels::Replicate::cpu_backward_kernel( - input_grad_accessor_cpu, - read_only_accessor_from_write_accessor(output_grad_accessor_cpu), - num_replicas); + input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas); std::vector result_data_cpu = load_accessor_data(input_grad_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 503da33984..420a449cca 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -160,11 +160,12 @@ TEST_SUITE(FF_TEST_SUITE) { }; // Run GPU Cast Forward Kernel - GenericTensorAccessorW input_accessor_gpu = - create_transformed_accessor_w( - input_shape, gpu_allocator, transform); + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_gpu); Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), input_accessor_gpu.get_float_ptr(), @@ -178,33 +179,32 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(output_accessor_gpu); // Run CPU Cast Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - create_transformed_accessor_w( - input_shape, cpu_allocator, transform); + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_cpu); - Kernels::Reverse::cpu_forward_kernel( - input_accessor_cpu.get_float_ptr(), - output_accessor_cpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_cpu.shape.num_elements()); + Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu, + num_out_blks, + reverse_dim_size, + in_blk_size); std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel - GenericTensorAccessorW output_grad_accessor_gpu = - create_random_filled_accessor_w(output_shape, + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = gpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_gpu); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), @@ -219,25 +219,22 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(input_grad_accessor_gpu); // Run CPU Cast Backward Kernel - GenericTensorAccessorW output_grad_accessor_cpu = - copy_tensor_between_memories( - read_only_accessor_from_write_accessor(output_grad_accessor_gpu), - cpu_allocator); + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = 
cpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_cpu); - Kernels::Reverse::cpu_backward_kernel( - output_grad_accessor_cpu.get_float_ptr(), - input_grad_accessor_cpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_cpu.shape.num_elements()); + Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_out_blks, + reverse_dim_size, + in_blk_size); std::vector result_data_cpu = load_accessor_data(input_grad_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 31ca2475e2..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -12,7 +12,6 @@ struct TrackedAllocator : public IAllocator { ~TrackedAllocator() = default; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 7bce6ef304..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,12 +12,6 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } -void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr = this->allocator.allocate_and_zero(requested_memory_size); - this->current_mem_usage += requested_memory_size; - return ptr; -} - void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); From 74d186d2e2648097c77b7d1bdba9a1983ddf1736 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:40:12 -0700 Subject: [PATCH 32/42] test_utils refactor and clarity --- lib/kernels/src/cuda/ops/reverse_kernels.cu | 13 ++- lib/kernels/test/src/test_attention_kernel.cc | 26 ++--- .../test/src/test_batch_matmul_kernel.cc | 12 +-- .../test/src/test_batch_norm_kernel.cc | 15 +-- lib/kernels/test/src/test_cast_kernel.cc | 49 --------- lib/kernels/test/src/test_dropout.cc | 6 +- lib/kernels/test/src/test_gather_kernels.cc | 3 +- .../test/src/test_layer_norm_kernels.cc | 3 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_replicate_kernel.cc | 80 -------------- lib/kernels/test/src/test_reverse_kernels.cc | 102 +----------------- lib/kernels/test/src/test_softmax_kernel.cc | 6 +- lib/kernels/test/src/test_split_kernel.cc | 6 +- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- lib/kernels/test/src/test_utils.cc | 77 +++++++++++++ 15 files changed, 119 insertions(+), 288 deletions(-) diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index c750819266..6469dfc735 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -26,6 +26,7 @@ namespace Reverse { // coord_t reverse_dim_size, // coord_t in_blk_size) { // CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t out_idx = i; // coord_t blk_idx = i / (reverse_dim_size * in_blk_size); // i = i - blk_idx * (reverse_dim_size * in_blk_size); // coord_t reverse_dim_idx = i / in_blk_size; @@ -33,8 +34,18 @@ namespace Reverse 
{ // coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + // (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + // i; -// out_ptr[i] = in_ptr[in_idx]; +// out_ptr[out_idx] = in_ptr[in_idx]; // } +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = in_ptr[in_idx]; +// } // } /* I mentioned this earlier, but I still think the reverse_forward_kernel code diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index ea861c7da9..bd0167a677 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -53,16 +53,13 @@ TEST_SUITE(FF_TEST_SUITE) { {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -82,20 +79,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, - allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 63e0909b9a..d78d5daee5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -29,14 +29,11 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, - allocator); + create_random_filled_accessor_w(input_shape_a, allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, - allocator); + create_random_filled_accessor_w(input_shape_b, allocator); GenericTensorAccessorW output_accessor = - 
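
The index arithmetic in the commented-out reverse kernel above is easier to verify in scalar form. A self-contained CPU sketch of the same computation -- reversing the middle of three logical dimensions, under the same (num_out_blks, reverse_dim_size, in_blk_size) decomposition:

#include <cstddef>
#include <vector>

std::vector<float> reverse_middle_dim(std::vector<float> const &in,
                                      std::size_t num_out_blks,
                                      std::size_t reverse_dim_size,
                                      std::size_t in_blk_size) {
  std::vector<float> out(in.size());
  for (std::size_t blk = 0; blk < num_out_blks; blk++) {
    for (std::size_t rev = 0; rev < reverse_dim_size; rev++) {
      for (std::size_t i = 0; i < in_blk_size; i++) {
        std::size_t out_idx =
            (blk * reverse_dim_size + rev) * in_blk_size + i;
        // read from the mirrored position along the reversed dimension
        std::size_t in_idx =
            (blk * reverse_dim_size + (reverse_dim_size - 1 - rev)) *
                in_blk_size + i;
        out[out_idx] = in[in_idx];
      }
    }
  }
  return out;
}
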
create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -55,8 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 79331a8539..d0ec2559ba 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -39,8 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w( @@ -63,17 +62,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, - allocator); + create_random_filled_accessor_w(scale_shape, allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, - allocator); + create_random_filled_accessor_w(bias_shape, allocator); Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 1afa126870..2ac27a9747 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -75,53 +75,4 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } - - TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); - - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - - // Only calling forward kernel as backward kernel is exactly the same - SUBCASE("forward_kernel") { - // Run GPU Forward Kernel - GenericTensorAccessorW input_accessor_gpu = - create_random_filled_accessor_w(input_shape, - gpu_allocator); - Kernels::Cast::forward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(input_accessor_gpu), - output_accessor_gpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run 
CPU Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - create_random_filled_accessor_w(input_shape, - cpu_allocator); - Kernels::Cast::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 4bcb37f083..ad74fa7d36 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -47,11 +47,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 45005092fe..b75614588c 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -42,8 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index cebf88986d..8368fe4efd 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -56,8 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 74d178bd64..ff74f6fb28 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -55,11 +55,9 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 902a5a7427..357d1958c0 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -110,84 +110,4 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } - - 
TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { - std::size_t num_replicas = 2; - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); - - ManagedPerDeviceFFHandle managed_handle{}; - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - // Run GPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_gpu); - - Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); - - Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); - - Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor_gpu, - output_grad_accessor_gpu, - num_replicas); - - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - - // Run CPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); - - Kernels::Replicate::cpu_backward_kernel( - input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 420a449cca..bf23188a8f 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -43,8 +43,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -138,103 +137,4 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } - - TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 2; - std::size_t reverse_dim_size = 3; - std::size_t in_blk_size = 5; - - TensorShape input_shape = 
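
The replicate backward kernels exercised in the test above reduce the replicated output gradient back to the input-gradient shape; per element it is just a sum over the replicas. A scalar sketch (replica-major layout is assumed here purely for illustration; the real accessors index by coordinate):

#include <cstddef>
#include <vector>

// input_grad has n elements; output_grad has n * num_replicas elements.
void replicate_backward_sketch(std::vector<float> &input_grad,
                               std::vector<float> const &output_grad,
                               std::size_t n,
                               std::size_t num_replicas) {
  for (std::size_t i = 0; i < n; i++) {
    float sum = 0.0f;
    for (std::size_t r = 0; r < num_replicas; r++) {
      sum += output_grad[r * n + i]; // accumulate this element's replicas
    }
    input_grad[i] = sum;
  }
}
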
make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); - TensorShape output_shape = input_shape; - - ManagedPerDeviceFFHandle managed_handle{}; - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - auto transform = [counter = 0.0f](float val) mutable { - return counter++; - }; - - // Run GPU Cast Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_gpu); - - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu.get_float_ptr(), - output_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_gpu.shape.num_elements()); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Cast Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); - - Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Cast Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); - - Kernels::Reverse::backward_kernel( - managed_stream.raw_stream(), - output_grad_accessor_gpu.get_float_ptr(), - input_grad_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_gpu.shape.num_elements()); - - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - - // Run CPU Cast Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); - - Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, - input_grad_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 7e6e95daaf..d4fb496f7b 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -32,13 +32,11 @@ TEST_SUITE(FF_TEST_SUITE) { input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + 
create_random_filled_accessor_w(input_shape, allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 4d3b948714..d98f88a30e 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -27,8 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -49,8 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index c35961b739..cac43c6ff3 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -39,8 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 70cca5f2f0..015918b8a5 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -110,6 +110,83 @@ struct CPUAccessorRContainsNonZero { } }; +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +bool contains_non_zero(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + return contains_non_zero(r_accessor); +} + +GenericTensorAccessorR + create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} + +GenericTensorAccessorW + create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); + } + return cpu_accessor; +} + +template +struct PrintCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>;
+
+    T const *data_ptr = accessor.get<DT>
(); + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + std::cout << data_ptr[i] << " "; + } + std::cout << "\n"; + } +}; + +void print_accessor(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +void print_accessor(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + print_accessor(r_accessor); +} + +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>;
+
+    T const *data_ptr = accessor.get<DT>
(); + + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = From f95d9da981b8c5b222c5ca983b9ae0a687b32b68 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 4 Nov 2024 23:12:02 -0800 Subject: [PATCH 33/42] R & W accessor changes, minimize code bloat --- .../test/src/test_managed_ff_stream.cc | 5 ++ .../src/test_managed_per_device_ff_handle.cc | 9 +-- lib/kernels/test/src/test_utils.cc | 77 ------------------- 3 files changed, 9 insertions(+), 82 deletions(-) diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 605aa6ffa1..3535dd258c 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -27,5 +27,10 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(&base_stream.raw_stream() == base_stream_ptr); } } + + SUBCASE("Test Self-Assignment") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d081a0b07c..b22c683205 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -28,14 +28,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*allowTensorOpMathConversion=*/true}; new_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == nullptr); - CHECK(&new_handle.raw_handle() == base_handle_ptr); - } + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } - SUBCASE("move assign to self") { + SUBCASE("move assign to self") { base_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() == base_handle_ptr); - } } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 015918b8a5..70cca5f2f0 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -110,83 +110,6 @@ struct CPUAccessorRContainsNonZero { } }; -bool contains_non_zero(GenericTensorAccessorR const &accessor) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - return DataTypeDispatch1{}( - cpu_accessor.data_type, cpu_accessor); -} - -bool contains_non_zero(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - return contains_non_zero(r_accessor); -} - -GenericTensorAccessorR - create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorR cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); - } - return cpu_accessor; -} - -GenericTensorAccessorW - create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorW cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); - } - return cpu_accessor; -} - -template -struct PrintCPUAccessorR { - void operator()(GenericTensorAccessorR const &accessor) { - using T = 
real_type_t<DT>
;
-
-    T const *data_ptr = accessor.get<DT>
(); - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - std::cout << data_ptr[i] << " "; - } - std::cout << "\n"; - } -}; - -void print_accessor(GenericTensorAccessorR const &accessor) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - DataTypeDispatch1{}(accessor.data_type, accessor); -} - -void print_accessor(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - print_accessor(r_accessor); -} - -template -struct CPUAccessorRContainsNonZero { - bool operator()(GenericTensorAccessorR const &accessor) { - using T = real_type_t
<DT>;
-
-    T const *data_ptr = accessor.get<DT>
(); - - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - if (data_ptr[i] != 0) { - return true; - } - } - - return false; - } -}; - bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = From 8c8bc75c03412614cce37e73e3808d0859bbb178 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 21 Nov 2024 22:16:51 -0800 Subject: [PATCH 34/42] issue #1502 & issue #1540 --- .../include/kernels/loss_function_kernels.h | 2 +- lib/kernels/include/kernels/pool_2d_kernels.h | 2 +- lib/pcg/include/pcg/metric.h | 73 +++++++++++++++++++ 3 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 lib/pcg/include/pcg/metric.h diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..9e0dbd4ba1 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 9650859a18..ad0a52efb9 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,7 +67,7 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, void const *output_ptr, void const *output_grad_ptr, diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h new file mode 100644 index 0000000000..f56078772e --- /dev/null +++ b/lib/pcg/include/pcg/metric.h @@ -0,0 +1,73 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include +#include "utils/fmt.h" +#include "op-attrs/ops/loss_functions/loss_functions.h" + +namespace FlexFlow { + +enum class Metric { + ACCURACY, + CATEGORICAL_CROSSENTROPY, + SPARSE_CATEGORICAL_CROSSENTROPY, + MEAN_SQUARED_ERROR, + ROOT_MEAN_SQUARED_ERROR, + MEAN_ABSOLUTE_ERROR, +}; + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::vector const &); + +public: + LossFunction loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +namespace fmt { + +template <> +struct formatter<::FlexFlow::Metric> : formatter { + template + auto format(::FlexFlow::Metric m, FormatContext &ctx) const + -> decltype(ctx.out()) { + using namespace FlexFlow; + + string_view name = "unknown"; + switch (m) { + case Metric::ACCURACY: + name = "Accuracy"; + break; + case Metric::CATEGORICAL_CROSSENTROPY: + name = "CategoricalCrossEntropy"; + break; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + name = "SparseCategoricalCrossEntropy"; + break; + case Metric::MEAN_SQUARED_ERROR: + name = "MeanSquaredError"; + break; + case Metric::ROOT_MEAN_SQUARED_ERROR: + name = "RootMeanSquaredError"; + break; + case Metric::MEAN_ABSOLUTE_ERROR: + name = "MeanAbsoluteError"; + break; + } + return formatter::format(name, ctx); + } +}; + +} // namespace fmt + + +#endif From c00ab840d1ae9a091368ea1440d5133b91bf0ea1 Mon Sep 
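
With the fmt::formatter specialization above in place, Metric values can be handed straight to fmt. A minimal usage sketch, assuming the header is reachable as pcg/metric.h per the diffstat:

#include "pcg/metric.h" // header added by the patch above
#include <fmt/format.h>

int main() {
  FlexFlow::Metric m = FlexFlow::Metric::MEAN_SQUARED_ERROR;
  fmt::print("metric = {}\n", m); // prints "metric = MeanSquaredError"
  return 0;
}
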
17 00:00:00 2001 From: Dylan Lim Date: Mon, 27 Jan 2025 20:57:10 -0800 Subject: [PATCH 35/42] branch merge and test fixes --- lib/kernels/include/kernels/loss_function_kernels.h | 2 +- lib/kernels/include/kernels/pool_2d_kernels.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index 9e0dbd4ba1..bab404f884 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index ad0a52efb9..9650859a18 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,7 +67,7 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(cudaStream_t stream, +void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, void const *output_ptr, void const *output_grad_ptr, From bc4b6592346306f27665c3dc7c31c306b5b14825 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Wed, 5 Feb 2025 01:30:08 -0800 Subject: [PATCH 36/42] merge --- lib/pcg/include/pcg/metric.h | 73 ------------------- .../src/pcg/{metric.cc => metric_attrs.cc} | 0 2 files changed, 73 deletions(-) delete mode 100644 lib/pcg/include/pcg/metric.h rename lib/pcg/src/pcg/{metric.cc => metric_attrs.cc} (100%) diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h deleted file mode 100644 index f56078772e..0000000000 --- a/lib/pcg/include/pcg/metric.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef _FF_METRICS_H_ -#define _FF_METRICS_H_ - -#include -#include "utils/fmt.h" -#include "op-attrs/ops/loss_functions/loss_functions.h" - -namespace FlexFlow { - -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - -} // namespace FlexFlow - -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - - -#endif diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric_attrs.cc similarity index 100% rename from 
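
Patches 34 and 35 together flip Pool2D's backward_kernel between cudaStream_t and ffStream_t; ffStream_t is the backend-portable alias, so the revert here is the right direction. A sketch of how such an alias is commonly defined (illustrative only -- the real definition lives in kernels/device.h, and the guard macro names here are assumptions):

#if defined(FF_USE_CUDA)
#include <cuda_runtime.h>
typedef cudaStream_t ffStream_t;
#elif defined(FF_USE_HIP_ROCM)
#include <hip/hip_runtime.h>
typedef hipStream_t ffStream_t;
#endif
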
lib/pcg/src/pcg/metric.cc rename to lib/pcg/src/pcg/metric_attrs.cc From e71b6d749cadff517a7b5e35cf69581d99474125 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Wed, 5 Feb 2025 05:19:37 -0800 Subject: [PATCH 37/42] build after merge --- lib/kernels/include/kernels/accessor.h | 7 +- .../include/kernels/batch_norm_kernels.h | 3 +- lib/kernels/src/accessor.cc | 5 +- lib/kernels/src/cpu/cast_kernels.cc | 4 +- lib/kernels/src/cpu/combine_kernels.cc | 5 +- lib/kernels/src/cpu/replicate_kernels.cc | 5 +- lib/kernels/src/cpu/reverse_kernels.cc | 7 +- lib/kernels/src/cuda/embedding_kernels.cu | 214 +++++++++--------- lib/kernels/src/cuda/optimizer_kernels.cu | 6 +- lib/kernels/test/src/test_cast_kernel.cc | 4 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 4 - .../src/test_managed_per_device_ff_handle.cc | 9 +- lib/kernels/test/src/test_reduction_kernel.cc | 1 - lib/kernels/test/src/test_replicate_kernel.cc | 12 +- lib/kernels/test/src/test_reverse_kernels.cc | 15 +- lib/kernels/test/src/test_utils.cc | 26 ++- lib/kernels/test/src/test_utils.h | 9 +- 18 files changed, 178 insertions(+), 160 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index a6fc4129e0..52ca62e217 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -17,17 +17,18 @@ inline int calculate_accessor_offset(std::vector const &indices, int multiplier = 1; for (int i = 0; i < shape.num_dims(); i++) { - if (indices.at(i) >= shape.at(legion_dim_t{i})) { + if (indices.at(i) >= shape.at(legion_dim_t{nonnegative_int{i}})) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, indices.at(i), - shape.at(legion_dim_t{i}))); + shape.at(legion_dim_t{nonnegative_int{i}}))); } offset += indices.at(i) * multiplier; - multiplier *= shape.at(legion_dim_t{i}); + multiplier *= + shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative(); } return offset; diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 26f347dd4c..90202592a7 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -7,7 +7,7 @@ #include "kernels/ff_handle.h" #include -namespace ::FlexFlow::Kernels::BatchNorm; +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -44,4 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index e56bded737..1a0abec1c5 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -7,8 +7,9 @@ namespace FlexFlow { void copy_accessor_data_to_l_from_r( GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor) { - size_t num_bytes = dst_accessor.shape.get_volume() * - size_of_datatype(dst_accessor.data_type); + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); DeviceType dst_device_type = dst_accessor.device_type; DeviceType src_device_type = src_accessor.device_type; diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 08f5552afc..cdd57b8947 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ 
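
The calculate_accessor_offset change above keeps the usual strided-offset computation, with legion dimension 0 varying fastest. A scalar sketch with a worked example (illustrative; the real function also bounds-checks each index, as the hunk shows):

#include <cassert>
#include <cstddef>
#include <vector>

int offset_sketch(std::vector<int> const &indices,
                  std::vector<int> const &dims) {
  int offset = 0;
  int multiplier = 1;
  for (std::size_t i = 0; i < dims.size(); i++) {
    offset += indices[i] * multiplier; // dim 0 varies fastest
    multiplier *= dims[i];
  }
  return offset;
}

int main() {
  // index {1, 2, 1} in a {4, 3, 2} tensor: 1*1 + 2*4 + 1*12 == 21
  assert(offset_sketch({1, 2, 1}, {4, 3, 2}) == 21);
  return 0;
}
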
b/lib/kernels/src/cpu/cast_kernels.cc @@ -21,7 +21,7 @@ template struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + size_t volume = input.shape.get_volume().unwrap_nonnegative(); cpu_cast_forward(input.get(), output.get(), volume); } }; @@ -30,7 +30,7 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.get_volume(); + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cpu_cast_backward( output.get(), input.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index d0be1f9f2d..577984f21a 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -9,7 +9,8 @@ struct CPUForwardKernel { GenericTensorAccessorW const &output) { memcpy(output.get
<DT>(),
           input.get<DT>
(), - input.shape.get_volume() * size_of_datatype(DT)); + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); } }; @@ -17,7 +18,7 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.get_volume(); + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); for (int i = 0; i < num_elements; ++i) { input_grad.get
<DT>()[i] += output_grad.get<DT>
()[i]; } diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index cfcb44dac5..1e50cad4b4 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -9,7 +9,8 @@ struct CPUForwardKernel { GenericTensorAccessorW &output) { memcpy(output.get
<DT>(),
           input.get<DT>
(), - input.shape.num_elements() * size_of_datatype(DT)); + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); } }; @@ -19,7 +20,7 @@ struct CPUBackwardKernel { GenericTensorAccessorW &input, size_t num_replicas) { using T = real_type_t
<DT>;
-    for (int i = 0; i < input.shape.num_elements(); i++) {
+    for (int i = 0; i < input.shape.num_elements().unwrap_nonnegative(); i++) {
       T cur_sum = 0;
       for (int j = 0; j < num_replicas; j++) {
         cur_sum += output.at<DT>
({i, j}); diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index bc73c80e9e..848094cda7 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,9 +11,10 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - int num_out_blocks = input.shape.at(legion_dim_t(0)); - int reverse_dim_size = input.shape.at(legion_dim_t(1)); - int in_block_size = input.shape.at(legion_dim_t(2)); + int num_out_blocks = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); + int reverse_dim_size = + input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); + int in_block_size = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) { for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index c83e9f0a94..7ea1d3b9d3 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -342,26 +342,28 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - in_dim, - batch_size, - aggr.value()); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -377,26 +379,28 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - in_dim, - batch_size, - aggr.value()); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -412,18 +416,19 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -448,26 +453,28 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - in_dim, - batch_size, - aggr.value()); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -483,26 +490,28 @@ struct ForwardKernel { int out_dim, int batch_size) { if 
(!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - in_dim, - batch_size, - aggr.value()); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -518,18 +527,19 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -570,7 +580,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -580,7 +590,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -606,7 +616,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -616,7 +626,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -642,7 +652,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -652,7 +662,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -678,7 +688,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -688,7 +698,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -714,7 +724,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -724,7 +734,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -750,7 +760,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -760,7 +770,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 1c6954a0b0..8aab79ba65 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -89,8 +89,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, using T = std::decay_t; if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v || - std::is_same_v) { + std::is_same_v) { throw mk_runtime_error("State type does not support NCCL operations"); } else { return s.handle.ncclComm; @@ -209,8 +208,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, using T = std::decay_t; if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v || - std::is_same_v) { + std::is_same_v) { throw mk_runtime_error("State type does not support NCCL operations"); } else { return s.handle.ncclComm; diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 2ac27a9747..c59d8eae3f 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -48,9 
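
The embedding hunks above only reflow the kernel launches, but the semantics they wrap are worth spelling out: with no aggregation each token id selects one row of the weight matrix, while AggregateOp::SUM / AVG reduce a sample's in_dim ids into a single output row. A scalar sketch of the aggregated case (row-major layouts assumed for illustration; this is not the CUDA kernel itself):

#include <vector>

// ids:    batch x in_dim token indices
// weight: vocab x out_dim embedding table
// output: batch x out_dim aggregated embeddings
void embed_forward_aggr_sketch(std::vector<int> const &ids,
                               std::vector<float> const &weight,
                               std::vector<float> &output,
                               int in_dim, int out_dim, int batch,
                               bool average) {
  for (int b = 0; b < batch; b++) {
    for (int d = 0; d < out_dim; d++) {
      float acc = 0.0f;
      for (int j = 0; j < in_dim; j++) {
        acc += weight[ids[b * in_dim + j] * out_dim + d];
      }
      output[b * out_dim + d] = average ? acc / in_dim : acc;
    }
  }
}
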
+48,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 2}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10_n, 2_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({10, 2}, DataType::DOUBLE); + make_tensor_shape_from_legion_dims({10_n, 2_n}, DataType::DOUBLE); // Only calling forward kernel as backward kernel is exactly the same SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 91f42669eb..97fa81920b 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({5, 5}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5_n, 5_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index ad74fa7d36..1a34c59be6 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -27,10 +27,6 @@ TEST_SUITE(FF_TEST_SUITE) { DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index b22c683205..d081a0b07c 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -28,13 +28,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*allowTensorOpMathConversion=*/true}; new_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == nullptr); - CHECK(&new_handle.raw_handle() == base_handle_ptr); - } + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } - SUBCASE("move assign to self") { + SUBCASE("move assign to self") { base_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 5078edee57..f91c4959cc 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -20,7 +20,6 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { TensorShape output_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 357d1958c0..677f1f8f5e 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), output_grad_accessor, input_grad_accessor, - num_replicas); + num_replicas.unwrap_nonnegative()); CHECK(contains_non_zero(input_grad_accessor)); } @@ -52,8 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = 
make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({5_n, num_replicas}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {5_n, num_replicas}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -95,7 +95,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), output_grad_accessor_gpu, input_grad_accessor_gpu, - num_replicas); + num_replicas.unwrap_nonnegative()); // Run CPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = @@ -104,7 +104,9 @@ TEST_SUITE(FF_TEST_SUITE) { create_zero_filled_accessor_w(input_shape, cpu_allocator); Kernels::Replicate::cpu_backward_kernel( - output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); + output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_replicas.unwrap_nonnegative()); CHECK(accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu)); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index bf23188a8f..4628cbd371 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -88,13 +88,14 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor_gpu = create_zero_filled_accessor_w(output_shape, gpu_allocator); - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu.get_float_ptr(), - output_accessor_gpu.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_accessor_gpu.shape.num_elements().unwrap_nonnegative()); + Kernels::Reverse::forward_kernel( + managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_accessor_gpu.shape.num_elements().unwrap_nonnegative()); // Run CPU Cast Forward Kernel GenericTensorAccessorR input_accessor_cpu = diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 70cca5f2f0..117c13a035 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -3,7 +3,7 @@ #include "utils/join_strings.h" #include -using namespace ::FlexFlow; +namespace FlexFlow { GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, Allocator &allocator) { @@ -12,12 +12,11 @@ GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, return result_accessor; } -TensorShape - make_tensor_shape_from_legion_dims(LegionOrdered const &dims, - DataType DT) { +TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, + DataType DT) { return TensorShape{ TensorDims{ - ff_ordered_from_legion_ordered(dims), + dims, }, DT, }; @@ -35,7 +34,7 @@ struct CreateRandomFilledAccessorW { std::random_device rd; std::mt19937 gen(rd()); - size_t num_elements = get_num_elements(shape); + size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); if constexpr (std::is_same::value) { std::bernoulli_distribution dist(0.5); for (size_t i = 0; i < num_elements; i++) { @@ -80,10 +79,14 @@ struct FillWithZeros { using T = real_type_t
; if (accessor.device_type == DeviceType::CPU) { - memset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T)); + memset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); } else { - checkCUDA( - cudaMemset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T))); + checkCUDA(cudaMemset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * + sizeof(T))); } } }; @@ -142,8 +145,8 @@ template struct Print2DCPUAccessorR { void operator()(GenericTensorAccessorR const &accessor, std::ostream &stream) { - int rows = accessor.shape.at(legion_dim_t{0}); - int cols = accessor.shape.at(legion_dim_t{1}); + int rows = accessor.shape.at(legion_dim_t{0_n}); + int cols = accessor.shape.at(legion_dim_t{1_n}); std::vector indices(cols); std::iota(indices.begin(), indices.end(), 0); @@ -246,3 +249,4 @@ GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, create_filled_accessor_w(shape, allocator, val); return read_only_accessor_from_write_accessor(w_accessor); } +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index a41bfc3aff..1d60562322 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -15,7 +15,7 @@ #include #include -using namespace ::FlexFlow; +namespace FlexFlow { GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator); @@ -26,9 +26,8 @@ GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, Allocator &allocator); -TensorShape - make_tensor_shape_from_legion_dims(LegionOrdered const &dims, - DataType DT); +TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, + DataType DT); bool contains_non_zero(GenericTensorAccessorR const &accessor); @@ -65,6 +64,8 @@ std::vector repeat(std::size_t n, Func &&func) { return result; } +} // namespace FlexFlow + // Specialize doctest's StringMaker for std::vector template <> struct doctest::StringMaker> { From 311caf88033e6bfcf954f249b29a3946dd801668 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sat, 8 Feb 2025 06:33:26 -0800 Subject: [PATCH 38/42] kernel issues --- lib/kernels/include/kernels/array_shape.h | 2 +- .../include/kernels/reverse_kernels_cpu.h | 11 +- lib/kernels/src/array_shape.cc | 26 +++- lib/kernels/src/cpu/replicate_kernels.cc | 6 +- lib/kernels/src/cpu/reverse_kernels.cc | 49 ++++---- lib/kernels/src/cuda/ops/concat_kernels.cu | 82 +++++++------ lib/kernels/src/cuda/ops/gather_kernels.cu | 19 +-- lib/kernels/src/cuda/ops/replicate_kernels.cu | 7 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 48 ++------ lib/kernels/src/legion_dim.cc | 4 +- lib/kernels/test/src/test_concat_kernel.cc | 113 +++++++++++++----- lib/kernels/test/src/test_gather_kernels.cc | 99 ++++++++++----- lib/kernels/test/src/test_replicate_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 29 +++-- lib/kernels/test/src/test_utils.cc | 41 ++++--- lib/kernels/test/src/test_utils.h | 6 +- .../utils/nonnegative_int/nonnegative_int.h | 3 + .../utils/nonnegative_int/nonnegative_int.cc | 9 ++ 18 files changed, 343 insertions(+), 213 deletions(-) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..72c746b8cc 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -15,7 +15,7 @@ namespace FlexFlow { struct ArrayShape { public: 
ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); + ArrayShape(nonnegative_int const *dims, nonnegative_int num_dims); ArrayShape(TensorShape const &shape); ArrayShape(std::vector const &); diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index 35af06aafb..e482557f93 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -7,10 +7,17 @@ namespace FlexFlow::Kernels::Reverse { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor); + GenericTensorAccessorW &output_accessor, + int num_out_blks, + int reverse_dim_size, + int in_blk_size); void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, - GenericTensorAccessorW &input_accessor); + GenericTensorAccessorW &input_accessor, + int num_out_blks, + int reverse_dim_size, + int in_blk_size); + } // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 54534f2ccf..30db65cc03 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -11,7 +11,7 @@ static LegionOrdered return LegionOrdered{reversed(vector_of(ff_ordered))}; } -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) +ArrayShape::ArrayShape(nonnegative_int const *_dims, nonnegative_int num_dims) : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} ArrayShape::ArrayShape(TensorShape const &shape) @@ -63,7 +63,29 @@ ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { - NOT_IMPLEMENTED(); + nonnegative_int num_dims = this->num_dims(); + + auto to_legion_index = [num_dims](auto arg) -> nonnegative_int { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return legion_dim_from_ff_dim(arg, num_dims).value; + } else { + return arg.value; + } + }; + + nonnegative_int start_idx = + (start.has_value()) ? std::visit(to_legion_index, start.value()) : 0_n; + + nonnegative_int end_idx = + (end.has_value()) ? std::visit(to_legion_index, end.value()) : num_dims; + + if (start_idx > num_dims || end_idx > num_dims || start_idx > end_idx) { + throw mk_runtime_error(fmt::format( + "Invalid sub_shape range: start={}, end={}", start_idx, end_idx)); + } + + return ArrayShape(&this->dims[legion_dim_t{start_idx}], end_idx - start_idx); } std::optional ArrayShape::at_maybe(legion_dim_t index) const { diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 1e50cad4b4..cdb030d2ff 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -18,9 +18,10 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, + size_t num_elements, size_t num_replicas) { using T = real_type_t
<DT>;
-    for (int i = 0; i < input.shape.num_elements().unwrap_nonnegative(); i++) {
+    for (int i = 0; i < num_elements; i++) {
       T cur_sum = 0;
       for (int j = 0; j < num_replicas; j++) {
         cur_sum += output.at<DT>({i, j});
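(For reference, the reduction this hunk factors num_elements out of can be
sketched in plain C++ on raw buffers. The accessor types are elided, and the
replica-major layout below is an assumption standing in for the
output.at<DT>({i, j}) indexing above, not the library's actual layout.)

    // Hedged sketch: the backward pass of replicate sums the num_replicas
    // copies of each gradient element back into the single input gradient.
    // Assumed layout (illustrative only): replica j of element i lives at
    // output[j * num_elements + i].
    template <typename T>
    void replicate_backward_cpu_sketch(T const *output, T *input,
                                       size_t num_elements,
                                       size_t num_replicas) {
      for (size_t i = 0; i < num_elements; i++) {
        T cur_sum = 0;
        for (size_t j = 0; j < num_replicas; j++) {
          cur_sum += output[j * num_elements + i];
        }
        input[i] = cur_sum;
      }
    }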
@@ -38,8 +39,9 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input,
 void cpu_backward_kernel(GenericTensorAccessorR const &output,
                          GenericTensorAccessorW &input,
                          size_t num_replicas) {
+  size_t num_elements = input.shape.num_elements().unwrap_nonnegative();
   DataTypeDispatch1<CPUBackwardKernel>{}(
-      input.data_type, output, input, num_replicas);
+      input.data_type, output, input, num_elements, num_replicas);
 }

 } // namespace FlexFlow::Kernels::Replicate
diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc
index 848094cda7..e259d059ff 100644
--- a/lib/kernels/src/cpu/reverse_kernels.cc
+++ b/lib/kernels/src/cpu/reverse_kernels.cc
@@ -1,6 +1,5 @@
 #include "kernels/datatype_dispatch.h"
 #include "kernels/reverse_kernels_cpu.h"
-#include
 #include

 namespace FlexFlow::Kernels::Reverse {
@@ -8,21 +7,15 @@ template <DataType DT>
 struct CPUReverseForwardKernel {
   void operator()(GenericTensorAccessorR const &input,
-                  GenericTensorAccessorW &output) {
-    assert(input.data_type == DT && output.data_type == DT);
-
-    int num_out_blocks = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative();
-    int reverse_dim_size =
-        input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative();
-    int in_block_size = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative();
-
-    for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) {
+                  GenericTensorAccessorW &output,
+                  int num_out_blks,
+                  int reverse_dim_size,
+                  int in_blk_size) {
+    for (int blk_idx = 0; blk_idx < num_out_blks; blk_idx++) {
       for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) {
-        for (int i = 0; i < in_block_size; i++) {
-          output.at<DT>({block_idx, rev_idx, i}) =
-              input.at<DT>({num_out_blocks - 1 - block_idx,
-                            reverse_dim_size - 1 - rev_idx,
-                            in_block_size - 1 - i});
+        for (int inner_idx = 0; inner_idx < in_blk_size; inner_idx++) {
+          output.at<DT>({inner_idx, rev_idx, blk_idx}) = input.at<DT>(
+              {inner_idx, reverse_dim_size - 1 - rev_idx, blk_idx});
         }
       }
     }
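(A standalone restatement of the new indexing may help here: only the middle
"reverse" dimension is flipped, while the block and inner coordinates pass
through unchanged. The flat row-major layout below is an assumption used
purely for illustration; the kernel above goes through accessor coordinates
instead.)

    // Hedged sketch of the rewritten CPU reverse kernel, on a flat buffer of
    // num_out_blks x reverse_dim_size x in_blk_size elements.
    void reverse_forward_cpu_sketch(float const *in, float *out,
                                    int num_out_blks, int reverse_dim_size,
                                    int in_blk_size) {
      for (int blk = 0; blk < num_out_blks; blk++) {
        for (int rev = 0; rev < reverse_dim_size; rev++) {
          for (int inner = 0; inner < in_blk_size; inner++) {
            int dst = (blk * reverse_dim_size + rev) * in_blk_size + inner;
            int src = (blk * reverse_dim_size +
                       (reverse_dim_size - 1 - rev)) * in_blk_size + inner;
            out[dst] = in[src];
          }
        }
      }
    }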
@@ -30,15 +23,29 @@ struct CPUReverseForwardKernel {
 };

 void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor,
-                        GenericTensorAccessorW &output_accessor) {
-  DataTypeDispatch1<CPUReverseForwardKernel>{}(
-      input_accessor.data_type, input_accessor, output_accessor);
+                        GenericTensorAccessorW &output_accessor,
+                        int num_out_blks,
+                        int reverse_dim_size,
+                        int in_blk_size) {
+  DataTypeDispatch1<CPUReverseForwardKernel>{}(input_accessor.data_type,
+                                               input_accessor,
+                                               output_accessor,
+                                               num_out_blks,
+                                               reverse_dim_size,
+                                               in_blk_size);
 }

 void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor,
-                         GenericTensorAccessorW &input_accessor) {
-  DataTypeDispatch1<CPUReverseForwardKernel>{}(
-      output_accessor.data_type, output_accessor, input_accessor);
+                         GenericTensorAccessorW &input_accessor,
+                         int num_out_blks,
+                         int reverse_dim_size,
+                         int in_blk_size) {
+  DataTypeDispatch1<CPUReverseForwardKernel>{}(output_accessor.data_type,
+                                               output_accessor,
+                                               input_accessor,
+                                               num_out_blks,
+                                               reverse_dim_size,
+                                               in_blk_size);
 }

 } // namespace FlexFlow::Kernels::Reverse
diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
index aa442f5c3d..683dbbaac5 100644
--- a/lib/kernels/src/cuda/ops/concat_kernels.cu
+++ b/lib/kernels/src/cuda/ops/concat_kernels.cu
@@ -23,38 +23,48 @@ void calc_blk_size(size_t &num_blocks,
                    size_t &blk_size,
                    ArrayShape const &shape,
                    ff_dim_t axis) {
-  blk_size = shape.sub_shape(legion_dim_t{0_n}, axis)
+  legion_dim_t legion_axis = (legion_dim_from_ff_dim(axis, shape.num_dims()));
+  assert(legion_axis.value < shape.num_dims());
+  if (legion_axis.value == 0_n) {
+    legion_axis.value = 1_n;
+  }
+  blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis)
                  .num_elements()
                  .unwrap_nonnegative();
-  num_blocks =
-      shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative();
+  num_blocks = shape.sub_shape(legion_axis, std::nullopt)
                   .num_elements()
                   .unwrap_nonnegative();
 }

 void forward_kernel(cudaStream_t stream,
                     GenericTensorAccessorW const &output,
                     std::vector<GenericTensorAccessorR> const &inputs,
                     ff_dim_t axis) {
-  size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
-  int num_inputs = inputs.size();
-  assert(num_inputs <= MAX_NUM_INPUTS);
+  assert(inputs.size() <= MAX_NUM_INPUTS);
+  size_t num_blocks = 1, output_blk_size = 1;
   calc_blk_size(num_blocks, output_blk_size, output.shape, axis);
-  for (int i = 0; i < num_inputs; i++) {
-    size_t input_num_blocks = 1;
-    calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis);
-    assert(input_num_blocks == num_blocks);
-  }
-
   off_t offset = 0;
-  for (int i = 0; i < num_inputs; i++) {
-    copy_with_stride<<>>(output.get_float_ptr() + offset,
-                                    inputs[i].get_float_ptr(),
-                                    num_blocks,
+                                    input.get_float_ptr(),
+                                    blocks_to_copy,
                                     output_blk_size,
-                                    input_blk_sizes[i]);
-    offset += input_blk_sizes[i];
+                                    input_blk_size);
+
+    offset += (output_blk_size == input_blk_size)
                   ?
input_blk_size * input_num_blocks + : input_blk_size; } } @@ -62,29 +72,31 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..99034089b5 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -18,9 +18,7 @@ #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,14 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -158,8 +159,12 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +185,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index b4fa5edb89..78022e869b 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), 
input.shape.num_elements().unwrap_nonnegative() * @@ -58,8 +57,8 @@ struct BackwardKernel { input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 6469dfc735..367e337b18 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -20,54 +20,20 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { -// __global__ void reverse_forward_kernel(float const *in_ptr, -// float *out_ptr, -// coord_t num_out_blks, -// coord_t reverse_dim_size, -// coord_t in_blk_size) { -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t out_idx = i; -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[out_idx] = in_ptr[in_idx]; -// } -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[i] = in_ptr[in_idx]; -// } -// } - -/* I mentioned this earlier, but I still think the reverse_forward_kernel code - is incorrect, even though it matches the code in inference/master? Whenever - I'm testing the code and printing out the output, I'm getting unexpected - outputs, and I think it's a result of modifying the loop index i in the - previous code? 
-*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = idx_within_blk / in_blk_size; - coord_t in_idx = idx_within_blk % in_blk_size; - coord_t input_index = - blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; - out_ptr[i] = in_ptr[input_index]; + i = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = i / in_blk_size; + i = i - reverse_dim_idx * in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; + out_ptr[out_idx] = in_ptr[in_idx]; } } diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index bbb15c5636..4e7fc56848 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -9,8 +9,8 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { - return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - - ff_dim.value.unwrap_nonnegative() - 1}}; + return legion_dim_t{num_dimensions - ff_dim.value - 1_n}; + ; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 3587cecedd..22da72912a 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -6,49 +6,96 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 2_n; - nonnegative_int size_per_input = 10_n; - ff_dim_t concat_axis = ff_dim_t{1_n}; - ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( - {num_inputs, size_per_input}, DataType::FLOAT); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - CHECK(contains_non_zero(output_accessor)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_rows, input_cols}, DataType::FLOAT); + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + 
CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {num_inputs * input_rows, input_cols}, DataType::FLOAT); + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {input_rows, num_inputs * input_cols}, DataType::FLOAT); + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, allocator); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_rows, input_cols}, DataType::FLOAT); + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {num_inputs * input_rows, input_cols}, DataType::FLOAT); + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {input_rows, num_inputs * input_cols}, DataType::FLOAT); + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index b75614588c..043617c790 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -3,54 +3,87 @@ #include "test_utils.h" using namespace ::FlexFlow; + TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({50_n}, DataType::FLOAT); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + 
create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - create_random_filled_accessor_r(output_shape, allocator); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - CHECK(contains_non_zero(output_accessor)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({10_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, allocator); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - CHECK(contains_non_zero(input_grad_accessor)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::FLOAT); + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 677f1f8f5e..87834d83ac 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -53,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); TensorShape output_shape = 
make_tensor_shape_from_legion_dims(
-        {5_n, num_replicas}, DataType::FLOAT);
+    make_tensor_shape_from_legion_dims(
+        {num_replicas, 5_n}, DataType::FLOAT);

     ManagedPerDeviceFFHandle managed_handle{
         /*workSpaceSize=*/1024 * 1024,
diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc
index 4628cbd371..481958fdfc 100644
--- a/lib/kernels/test/src/test_reverse_kernels.cc
+++ b/lib/kernels/test/src/test_reverse_kernels.cc
@@ -7,9 +7,9 @@ using namespace ::FlexFlow;

 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Call Reverse Forward and Backward Kernels") {
+    nonnegative_int num_out_blks = 1_n;
     nonnegative_int reverse_dim_size = 10_n;
     nonnegative_int in_blk_size = 10_n;
-    nonnegative_int num_out_blks = 1_n;

     TensorShape input_shape = make_tensor_shape_from_legion_dims(
         {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT);
@@ -61,9 +61,9 @@
   }

   TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") {
-    nonnegative_int num_out_blks = 4_n;
-    nonnegative_int reverse_dim_size = 3_n;
-    nonnegative_int in_blk_size = 2_n;
+    nonnegative_int num_out_blks = 1_n;
+    nonnegative_int reverse_dim_size = 4_n;
+    nonnegative_int in_blk_size = 3_n;

     TensorShape input_shape = make_tensor_shape_from_legion_dims(
         {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT);
@@ -78,10 +78,6 @@ TEST_SUITE(FF_TEST_SUITE) {
     Allocator cpu_allocator = create_local_cpu_memory_allocator();

     SUBCASE("forward_kernel") {
-      auto transform = [counter = 0.0f](float val) mutable {
-        return counter++;
-      };
-
       // Run GPU Cast Forward Kernel
       GenericTensorAccessorR input_accessor_gpu =
           create_random_filled_accessor_r(input_shape, gpu_allocator);
@@ -103,8 +99,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       GenericTensorAccessorW output_accessor_cpu =
           create_zero_filled_accessor_w(output_shape, cpu_allocator);

-      Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu,
-                                           output_accessor_cpu);
+      Kernels::Reverse::cpu_forward_kernel(
+          input_accessor_cpu,
+          output_accessor_cpu,
+          num_out_blks.unwrap_nonnegative(),
+          reverse_dim_size.unwrap_nonnegative(),
+          in_blk_size.unwrap_nonnegative());

       CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu));
     }

@@ -113,6 +113,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       // Run GPU Cast Backward Kernel
       GenericTensorAccessorR output_grad_accessor_gpu =
           create_random_filled_accessor_r(output_shape, gpu_allocator);
+
       GenericTensorAccessorW input_grad_accessor_gpu =
           create_zero_filled_accessor_w(input_shape, gpu_allocator);

@@ -131,8 +132,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       GenericTensorAccessorW input_grad_accessor_cpu =
           create_zero_filled_accessor_w(input_shape, cpu_allocator);

-      Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu,
-                                            input_grad_accessor_cpu);
+      Kernels::Reverse::cpu_backward_kernel(
+          output_grad_accessor_cpu,
+          input_grad_accessor_cpu,
+          num_out_blks.unwrap_nonnegative(),
+          reverse_dim_size.unwrap_nonnegative(),
+          in_blk_size.unwrap_nonnegative());

       CHECK(accessors_are_equal(input_grad_accessor_gpu,
                                 input_grad_accessor_cpu));
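(A note on the dimension flips in the hunks above, e.g. {5_n, num_replicas}
becoming {num_replicas, 5_n}: legion dimension order is the reverse of ff
dimension order, so the same logical shape is spelled differently depending on
which ordering a helper expects. Below is a plain-int restatement of the
conversion; the real legion_dim_from_ff_dim, changed earlier in this patch,
operates on the ff_dim_t, legion_dim_t, and nonnegative_int wrappers instead.)

    // For a rank-3 tensor: ff dim 0 <-> legion dim 2, ff dim 1 <-> legion
    // dim 1, ff dim 2 <-> legion dim 0.
    int legion_dim_from_ff_dim_sketch(int ff_dim, int num_dims) {
      return num_dims - ff_dim - 1;
    }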
diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc
index 117c13a035..bc5f48654a 100644
--- a/lib/kernels/test/src/test_utils.cc
+++ b/lib/kernels/test/src/test_utils.cc
@@ -5,13 +5,6 @@

 namespace FlexFlow {

-GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape,
-                                                     Allocator &allocator) {
-  GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape);
-  fill_with_zeros(result_accessor);
-  return result_accessor;
-}
-
 TensorShape make_tensor_shape_from_legion_dims(FFOrdered<nonnegative_int> dims,
                                                DataType DT) {
   return TensorShape{
@@ -22,6 +15,20 @@ TensorShape make_tensor_shape_from_legion_dims(FFOrdered<nonnegative_int> dims,
                                                DataType DT) {
   };
 }

+GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape,
+                                                     Allocator &allocator) {
+  GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape);
+  fill_with_zeros(result_accessor);
+  return result_accessor;
+}
+
+GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape,
+                                                     Allocator &allocator) {
+  GenericTensorAccessorW accessor =
+      create_zero_filled_accessor_w(shape, allocator);
+  return read_only_accessor_from_write_accessor(accessor);
+}
+
 template <DataType DT>
 struct CreateRandomFilledAccessorW {
   GenericTensorAccessorW operator()(TensorShape const &shape,
@@ -46,7 +53,7 @@ struct CreateRandomFilledAccessorW {
         data_ptr[i] = dist(gen);
       }
     } else if constexpr (std::is_integral<T>::value) {
-      std::uniform_int_distribution<T> dist(0, 100);
+      std::uniform_int_distribution<T> dist(0, 99);
       for (size_t i = 0; i < num_elements; i++) {
         data_ptr[i] = dist(gen);
       }
@@ -145,15 +152,19 @@ template <DataType DT>
 struct Print2DCPUAccessorR {
   void operator()(GenericTensorAccessorR const &accessor,
                   std::ostream &stream) {
-    int rows = accessor.shape.at(legion_dim_t{0_n});
-    int cols = accessor.shape.at(legion_dim_t{1_n});
+    int const dims = accessor.shape.num_dims();
+    int const cols = accessor.shape.at(legion_dim_t{0_n});
+    int const rows = (dims == 2) ? accessor.shape.at(legion_dim_t{1_n}) : 1_n;
+
+    auto get_element = [dims, &accessor](int j, int i) {
+      return (dims == 1) ? accessor.at<DT>
({j}) : accessor.at<DT>
({j, i}); + }; std::vector indices(cols); std::iota(indices.begin(), indices.end(), 0); - - for (int i = 0; i < rows; i++) { - stream << join_strings(indices, " ", [&](int k) { - return accessor.at
({i, k}); + for (int i = 0; i < rows; ++i) { + stream << join_strings(indices, " ", [=](int j) { + return get_element(j, i); }) << std::endl; } } @@ -165,7 +176,7 @@ void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, GenericTensorAccessorR cpu_accessor = copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); DataTypeDispatch1{}( - accessor.data_type, accessor, stream); + accessor.data_type, cpu_accessor, stream); } template diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 1d60562322..093a9a4a97 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -26,6 +26,9 @@ GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, Allocator &allocator); +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, DataType DT); @@ -41,7 +44,8 @@ GenericTensorAccessorR copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, Allocator &allocator); -void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor); +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream); bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, GenericTensorAccessorR const &accessor_b); diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h index 0bcc8cfd6f..150fb9ba8b 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h @@ -43,6 +43,9 @@ class nonnegative_int { nonnegative_int operator++(int); nonnegative_int &operator+=(nonnegative_int const &other); + nonnegative_int operator-(nonnegative_int const &other) const; + nonnegative_int &operator-=(nonnegative_int const &other); + nonnegative_int operator*(nonnegative_int const &other) const; nonnegative_int &operator*=(nonnegative_int const &other); diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc index e86c242250..75e2a349ec 100644 --- a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc @@ -101,6 +101,15 @@ nonnegative_int &nonnegative_int::operator+=(nonnegative_int const &other) { return *this; } +nonnegative_int nonnegative_int::operator-(nonnegative_int const &other) const { + return nonnegative_int{this->value_ - other.value_}; +} + +nonnegative_int &nonnegative_int::operator-=(nonnegative_int const &other) { + *this = nonnegative_int{this->value_ - other.value_}; + return *this; +} + nonnegative_int nonnegative_int::operator*(nonnegative_int const &other) const { return nonnegative_int{this->value_ * other.value_}; } From 157407d3f0b8511d2cac18e7c3d7ac1a60816c42 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 9 Feb 2025 20:44:42 -0800 Subject: [PATCH 39/42] managed stream / handle test case fix --- .../test/src/test_managed_ff_stream.cc | 97 ++++++++++++++----- .../src/test_managed_per_device_ff_handle.cc | 6 +- 2 files changed, 76 insertions(+), 27 deletions(-) diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 3535dd258c..87b564d284 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ 
b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -1,36 +1,89 @@ #include "doctest/doctest.h" -#include "kernels/managed_ff_stream.h" +#include "kernels/gather_kernels.h" +#include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("ManagedFFStream") { - ManagedFFStream base_stream{}; - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - - SUBCASE("move constructor") { - ManagedFFStream new_stream(std::move(base_stream)); - CHECK(&base_stream.raw_stream() == nullptr); - CHECK(&new_stream.raw_stream() == base_stream_ptr); - } + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("move assignment operator") { - SUBCASE("move assign to other") { - ManagedFFStream new_stream{}; - new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); - CHECK(&new_stream.raw_stream() == base_stream_ptr); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); } - SUBCASE("move assign to self") { - base_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == base_stream_ptr); + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({10_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); } } - SUBCASE("Test Self-Assignment") { - base_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == base_stream_ptr); + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = + 
make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::FLOAT); + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d081a0b07c..5902664a14 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -4,7 +4,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("ManagedPerDeviceFFHandle") { + TEST_CASE("Test ManagedPerDeviceFFHandle") { ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, /*allowTensorOpMathConversion=*/true}; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); @@ -16,8 +16,6 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("move constructor") { ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); - - CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } @@ -27,8 +25,6 @@ TEST_SUITE(FF_TEST_SUITE) { /*workSpaceSize=*/1024 * 1024, /*allowTensorOpMathConversion=*/true}; new_handle = std::move(base_handle); - - CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } From f73e7a1784b80a0d1584d5141e10f525497dd99c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 24 Feb 2025 19:37:03 -0800 Subject: [PATCH 40/42] accessor, array_shape, copy_tensor_accessor, datatype_dispatch, allocator, and perf_metrics tests --- .envrc | 3 + .proj.toml | 1 + .vimrc | 8 ++ lib/kernels/include/kernels/accessor.h | 9 +- .../include/kernels/copy_tensor_accessor.h | 11 ++ lib/kernels/src/accessor.cc | 73 +++++----- lib/kernels/src/array_shape.cc | 38 ++++- lib/kernels/src/copy_tensor_accessor.cc | 59 ++++++++ lib/kernels/src/legion_dim.cc | 1 - lib/kernels/src/perf_metrics.cc | 5 +- lib/kernels/test/src/test_accessor.cc | 136 ++++++++++++++++++ lib/kernels/test/src/test_array_shape.cc | 105 ++++++++++++++ lib/kernels/test/src/test_attention_kernel.cc | 10 +- .../test/src/test_batch_matmul_kernel.cc | 6 +- .../test/src/test_batch_norm_kernel.cc | 8 +- lib/kernels/test/src/test_cast_kernel.cc | 10 +- lib/kernels/test/src/test_combine_kernel.cc | 10 +- lib/kernels/test/src/test_concat_kernel.cc | 12 +- .../test/src/test_copy_tensor_accessor.cc | 76 ++++++++++ .../test/src/test_datatype_dispatch.cc | 65 +++++++++ lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 18 +-- .../test/src/test_layer_norm_kernels.cc | 4 +- lib/kernels/test/src/test_legion_dim.cc | 29 ++++ .../test/src/test_local_cpu_allocator.cc | 19 +++ .../test/src/test_local_cuda_allocator.cc | 19 +++ .../test/src/test_managed_ff_stream.cc | 18 +-- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_perf_metrics.cc | 127 ++++++++++++++++ lib/kernels/test/src/test_pool_2d_kernels.cc | 4 +- lib/kernels/test/src/test_reduction_kernel.cc | 4 +- lib/kernels/test/src/test_replicate_kernel.cc | 12 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 8 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 4 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- 
lib/kernels/test/src/test_utils.cc | 63 +------- lib/kernels/test/src/test_utils.h | 16 +-- 40 files changed, 815 insertions(+), 188 deletions(-) create mode 100644 .envrc create mode 100644 .vimrc create mode 100644 lib/kernels/test/src/test_accessor.cc create mode 100644 lib/kernels/test/src/test_array_shape.cc create mode 100644 lib/kernels/test/src/test_copy_tensor_accessor.cc create mode 100644 lib/kernels/test/src/test_datatype_dispatch.cc create mode 100644 lib/kernels/test/src/test_legion_dim.cc create mode 100644 lib/kernels/test/src/test_local_cpu_allocator.cc create mode 100644 lib/kernels/test/src/test_local_cuda_allocator.cc create mode 100644 lib/kernels/test/src/test_perf_metrics.cc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..2797f0f929 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source_up_if_exists + +use flake diff --git a/.proj.toml b/.proj.toml index 10307a6efa..b3b90bbada 100644 --- a/.proj.toml +++ b/.proj.toml @@ -15,6 +15,7 @@ build_targets = [ "models", "export-model-arch", "substitution-to-dot", + "kernels-tests", ] test_targets = [ diff --git a/.vimrc b/.vimrc new file mode 100644 index 0000000000..4c8a8a8279 --- /dev/null +++ b/.vimrc @@ -0,0 +1,8 @@ +" example search path configuration +set path=lib/runtime/**,lib/** + +" set build target +" let g:target = "pcg" + +" set test target +" let g:test_target = "utils-test" diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 52ca62e217..8bbcf3ef95 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -267,6 +267,12 @@ std::vector const *> return out; } +bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); @@ -280,9 +286,6 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorR const &src_accessor); - } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h index da8af71e4f..97b6254750 100644 --- a/lib/kernels/include/kernels/copy_tensor_accessor.h +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -6,6 +6,9 @@ namespace FlexFlow { +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); + GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator); @@ -14,6 +17,14 @@ GenericTensorAccessorW copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, Allocator &allocator); +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &allocator); + } // namespace FlexFlow #endif diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 1a0abec1c5..43f57717f8 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,38 +1,45 @@ #include "kernels/accessor.h" -#include 
"kernels/allocation.h" +#include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include +#include namespace FlexFlow { -void copy_accessor_data_to_l_from_r( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorR const &src_accessor) { - size_t num_bytes = - dst_accessor.shape.get_volume().unwrap_nonnegative() * - size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); - - DeviceType dst_device_type = dst_accessor.device_type; - DeviceType src_device_type = src_accessor.device_type; - - if (src_device_type == DeviceType::CPU && - dst_device_type == DeviceType::CPU) { - memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); - } else if (src_device_type == DeviceType::CPU && - dst_device_type == DeviceType::GPU) { - checkCUDA(cudaMemcpy( - dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); - } else if (src_device_type == DeviceType::GPU && - dst_device_type == DeviceType::CPU) { - checkCUDA(cudaMemcpy( - dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); - } else { - assert(src_device_type == DeviceType::GPU); - assert(dst_device_type == DeviceType::GPU); - checkCUDA(cudaMemcpy(dst_accessor.ptr, - src_accessor.ptr, - num_bytes, - cudaMemcpyDeviceToDevice)); +template +struct AccessorDataIsEqual { + bool operator()(GenericTensorAccessorR const &a, + GenericTensorAccessorR const &b) { + int const num_elements = a.shape.num_elements().unwrap_nonnegative(); + if (num_elements != b.shape.num_elements().unwrap_nonnegative()) { + return false; + } + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + auto cpu_a = copy_accessor_r_to_cpu_if_necessary(a, cpu_allocator); + auto cpu_b = copy_accessor_r_to_cpu_if_necessary(b, cpu_allocator); + + using T = real_type_t
<DT>; T const *a_ptr = cpu_a.get<DT>(); T const *b_ptr = cpu_b.get<DT>
(); + + return std::equal(a_ptr, a_ptr + num_elements, b_ptr); } +}; + +bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + return accessor_a.data_type == accessor_b.data_type && + accessor_a.device_type == accessor_b.device_type && + accessor_a.shape == accessor_b.shape && + accessor_data_is_equal(accessor_a, accessor_b); } GenericTensorAccessorW::operator GenericTensorAccessorR() const { @@ -56,12 +63,12 @@ std::tupletie() == other.tie(); + return accessors_are_equal(*this, other); } bool GenericTensorAccessorW::operator!=( GenericTensorAccessorW const &other) const { - return this->tie() != other.tie(); + return !(accessors_are_equal(*this, other)); } int32_t *GenericTensorAccessorW::get_int32_ptr() const { @@ -112,12 +119,12 @@ std::tupletie() == other.tie(); + return accessors_are_equal(*this, other); } bool GenericTensorAccessorR::operator!=( GenericTensorAccessorR const &other) const { - return this->tie() != other.tie(); + return !(accessors_are_equal(*this, other)); } int32_t const *GenericTensorAccessorR::get_int32_ptr() const { diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 30db65cc03..499aebad86 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -51,18 +51,40 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } +legion_dim_t ArrayShape::last_idx() const { + if (this->dims.size() == 0) { + throw mk_runtime_error("Cannot get last index of an empty shape"); + } + return legion_dim_t(nonnegative_int{this->dims.size() - 1}); +} + +legion_dim_t ArrayShape::neg_idx(int idx) const { + if (std::abs(idx) > this->dims.size()) { + throw mk_runtime_error( + fmt::format("Invalid negative index: {} (shape has {} dimensions)", + idx, + this->dims.size())); + } + + if (idx >= 0) { + throw mk_runtime_error(fmt::format( + "Idx should be negative for negative indexing, got {}", idx)); + } + + return legion_dim_t(nonnegative_int{this->dims.size() + idx}); +} + bool ArrayShape::operator==(ArrayShape const &other) const { - return this->tie() == other.tie(); + return this->dims == other.dims; } bool ArrayShape::operator!=(ArrayShape const &other) const { - return this->tie() != other.tie(); + return !(this->dims == other.dims); } ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { - nonnegative_int num_dims = this->num_dims(); auto to_legion_index = [num_dims](auto arg) -> nonnegative_int { @@ -85,7 +107,9 @@ ArrayShape ArrayShape::sub_shape( "Invalid sub_shape range: start={}, end={}", start_idx, end_idx)); } - return ArrayShape(&this->dims[legion_dim_t{start_idx}], end_idx - start_idx); + return ArrayShape(std::vector( + this->dims.begin() + start_idx.unwrap_nonnegative(), + this->dims.begin() + end_idx.unwrap_nonnegative())); } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -97,7 +121,11 @@ std::optional ArrayShape::at_maybe(legion_dim_t index) const { } std::optional ArrayShape::at_maybe(ff_dim_t index) const { - return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); + if (index.value < this->num_dims()) { + return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); + } else { + return std::nullopt; + } } std::tuple const 
&> ArrayShape::tie() const { diff --git a/lib/kernels/src/copy_tensor_accessor.cc b/lib/kernels/src/copy_tensor_accessor.cc index 6a3ad8033a..cc033223f8 100644 --- a/lib/kernels/src/copy_tensor_accessor.cc +++ b/lib/kernels/src/copy_tensor_accessor.cc @@ -3,6 +3,37 @@ namespace FlexFlow { +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + template struct CopyTensorAccessorW { GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, @@ -45,4 +76,32 @@ GenericTensorAccessorR src_accessor.data_type, src_accessor, allocator); } +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) { + throw mk_runtime_error("Allocator must be a CPU allocator"); + } + + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} + +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) { + throw mk_runtime_error("Allocator must be a CPU allocator"); + } + + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); + } + return cpu_accessor; +} + } // namespace FlexFlow diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 4e7fc56848..14016a6202 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -10,7 +10,6 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { return legion_dim_t{num_dimensions - ff_dim.value - 1_n}; - ; } } // namespace FlexFlow diff --git a/lib/kernels/src/perf_metrics.cc b/lib/kernels/src/perf_metrics.cc index 2036ddd35a..ab0e113a26 100644 --- a/lib/kernels/src/perf_metrics.cc +++ b/lib/kernels/src/perf_metrics.cc @@ -15,8 +15,9 @@ PerfMetrics::PerfMetrics(int _train_all, double _start_time_micro, double _current_time_micro) : train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss), - mse_loss(_mse_loss), rmse_loss(_rmse_loss), mae_loss(_mae_loss), - start_time(_start_time_micro), current_time(_current_time_micro) {} + 
sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss), + rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro), + current_time(_current_time_micro) {} float get_throughput(PerfMetrics const &m) { return m.train_all / (m.current_time - m.start_time); diff --git a/lib/kernels/test/src/test_accessor.cc b/lib/kernels/test/src/test_accessor.cc new file mode 100644 index 0000000000..e9611a928c --- /dev/null +++ b/lib/kernels/test/src/test_accessor.cc @@ -0,0 +1,136 @@ +#include "doctest/doctest.h" +#include "kernels/accessor.h" +#include "op-attrs/datatype_value.h" +#include "test_utils.h" + +using namespace ::FlexFlow; + +template +void check_accessor_get(GenericTensorAccessorR const &accessor, + real_type_t
<DT> expected) {
+  CHECK(*accessor.get<DT>() == expected);
+
+  if constexpr (DT == DataType::INT32) {
+    CHECK(*accessor.get_int32_ptr() == expected);
+  } else if constexpr (DT == DataType::INT64) {
+    CHECK(*accessor.get_int64_ptr() == expected);
+  } else if constexpr (DT == DataType::FLOAT) {
+    CHECK(*accessor.get_float_ptr() == doctest::Approx(expected));
+  } else if constexpr (DT == DataType::DOUBLE) {
+    CHECK(*accessor.get_double_ptr() == doctest::Approx(expected));
+  } else if constexpr (DT == DataType::HALF) {
+    CHECK(*accessor.get_half_ptr() == doctest::Approx(expected));
+  }
+}
+
+template <DataType DT>
+void run_accessor_w_test(DataTypeValue value,
+                         real_type_t<DT> expected,
+                         Allocator allocator) {
+  TensorShape shape = make_tensor_shape_from_ff_ordered({1_n}, DT);
+  GenericTensorAccessorW accessor =
+      create_filled_accessor_w(shape, allocator, value);
+  check_accessor_get<DT>(read_only_accessor_from_write_accessor(accessor),
+                         expected);
+}
+
+template <DataType DT>
+void run_accessor_r_test(DataTypeValue value,
+                         real_type_t<DT> expected,
+                         Allocator allocator) {
+  TensorShape shape = make_tensor_shape_from_ff_ordered({1_n}, DT);
+  GenericTensorAccessorR accessor =
+      create_filled_accessor_r(shape, allocator, value);
+  check_accessor_get<DT>
(accessor, expected); +} + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test GenericTensorAccessors") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("Test GenericTensorAccessorW") { + SUBCASE("Test get methods for GenericTensorAccessorW") { + run_accessor_w_test( + make_int32_data_type_value(12345), 12345, cpu_allocator); + run_accessor_w_test( + make_int64_data_type_value(12345LL), 12345LL, cpu_allocator); + run_accessor_w_test( + make_float_data_type_value(1.23f), 1.23f, cpu_allocator); + run_accessor_w_test( + make_double_data_type_value(1.23), 1.23, cpu_allocator); + } + + SUBCASE("Test operator== and operator!= for GenericTensorAccessorW") { + TensorShape shape = + make_tensor_shape_from_ff_ordered({1_n}, DataType::INT32); + + GenericTensorAccessorW accessor1 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorW accessor2 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorW accessor3 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(54321)); + + CHECK(accessor1 == accessor2); + CHECK(accessor1 != accessor3); + } + + SUBCASE("Test at() method for GenericTensorAccessorW") { + DataType const DT = DataType::INT32; + TensorShape shape = make_tensor_shape_from_ff_ordered({3_n, 3_n}, DT); + + GenericTensorAccessorW accessor_1 = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW accessor_2 = + copy_tensor_accessor_w(accessor_1, cpu_allocator); + + CHECK(accessor_1.at
<DT>({0, 0}) == accessor_2.at<DT>({0, 0}));
+        CHECK(accessor_1.at<DT>({1, 0}) == accessor_2.at<DT>({1, 0}));
+        CHECK(accessor_1.at<DT>({2, 2}) == accessor_2.at<DT>
({2, 2})); + } + } + + SUBCASE("Test GenericTensorAccessorR") { + + SUBCASE("Test get methods for GenericTensorAccessorR") { + run_accessor_r_test( + make_int32_data_type_value(12345), 12345, cpu_allocator); + run_accessor_r_test( + make_int64_data_type_value(12345LL), 12345LL, cpu_allocator); + run_accessor_r_test( + make_float_data_type_value(1.23f), 1.23f, cpu_allocator); + run_accessor_r_test( + make_double_data_type_value(1.23), 1.23, cpu_allocator); + } + + SUBCASE("Test operator== and operator!= for GenericTensorAccessorR") { + TensorShape shape = + make_tensor_shape_from_ff_ordered({1_n}, DataType::INT32); + + GenericTensorAccessorR accessor1 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorR accessor2 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorR accessor3 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(54321)); + + CHECK(accessor1 == accessor2); + CHECK(accessor1 != accessor3); + } + + SUBCASE("Test at() method for GenericTensorAccessorR") { + DataType const DT = DataType::INT32; + TensorShape shape = make_tensor_shape_from_ff_ordered({3_n, 3_n}, DT); + + GenericTensorAccessorR accessor_1 = + create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR accessor_2 = + copy_tensor_accessor_r(accessor_1, cpu_allocator); + + CHECK(accessor_1.at
<DT>({0, 0}) == accessor_2.at<DT>({0, 0}));
+        CHECK(accessor_1.at<DT>({1, 0}) == accessor_2.at<DT>({1, 0}));
+        CHECK(accessor_1.at<DT>({2, 2}) == accessor_2.at<DT>
({2, 2})); + } + } + } +} diff --git a/lib/kernels/test/src/test_array_shape.cc b/lib/kernels/test/src/test_array_shape.cc new file mode 100644 index 0000000000..7ede1791ef --- /dev/null +++ b/lib/kernels/test/src/test_array_shape.cc @@ -0,0 +1,105 @@ +#include "doctest/doctest.h" +#include "kernels/array_shape.h" +#include "test_utils.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test ArrayShape") { + ArrayShape shape({1_n, 2_n, 3_n, 4_n}); + + SUBCASE("Test get_volume() and num_elements()") { + CHECK(shape.get_volume() == 1 * 2 * 3 * 4); + CHECK(shape.num_elements() == 1 * 2 * 3 * 4); + } + + SUBCASE("Test num_dims() and get_dim()") { + CHECK(shape.num_dims() == 4); + CHECK(shape.get_dim() == 4); + } + + SUBCASE("Test operator[] and at()") { + CHECK(shape[legion_dim_t{0_n}] == 1); + CHECK(shape[legion_dim_t{1_n}] == 2); + CHECK(shape[legion_dim_t{2_n}] == 3); + CHECK(shape[legion_dim_t{3_n}] == 4); + + CHECK(shape.at(legion_dim_t{0_n}) == 1); + CHECK(shape.at(legion_dim_t{1_n}) == 2); + CHECK(shape.at(legion_dim_t{2_n}) == 3); + CHECK(shape.at(legion_dim_t{3_n}) == 4); + + CHECK(shape.at(ff_dim_t{0_n}) == 4); + CHECK(shape.at(ff_dim_t{1_n}) == 3); + CHECK(shape.at(ff_dim_t{2_n}) == 2); + CHECK(shape.at(ff_dim_t{3_n}) == 1); + } + + SUBCASE("Test operator== and operator!=") { + ArrayShape shape2({1_n, 2_n, 3_n, 4_n}); + ArrayShape shape3({1_n, 2_n, 3_n, 5_n}); + + CHECK(shape == shape2); + CHECK(shape != shape3); + } + + SUBCASE("Test last_idx()") { + CHECK(shape.last_idx() == legion_dim_t{3_n}); + + ArrayShape empty_shape(std::vector{}); + CHECK_THROWS(empty_shape.last_idx()); + } + + SUBCASE("Test neg_idx()") { + CHECK(shape.neg_idx(-1) == legion_dim_t{3_n}); + CHECK(shape.neg_idx(-2) == legion_dim_t{2_n}); + CHECK(shape.neg_idx(-3) == legion_dim_t{1_n}); + CHECK(shape.neg_idx(-4) == legion_dim_t{0_n}); + + CHECK_THROWS(shape.neg_idx(-5)); + } + + SUBCASE("Test at_maybe()") { + CHECK(shape.at_maybe(legion_dim_t{0_n}).value() == 1); + CHECK(shape.at_maybe(legion_dim_t{1_n}).value() == 2); + CHECK(shape.at_maybe(legion_dim_t{2_n}).value() == 3); + CHECK(shape.at_maybe(legion_dim_t{3_n}).value() == 4); + CHECK(!shape.at_maybe(legion_dim_t{4_n}).has_value()); + + CHECK(shape.at_maybe(ff_dim_t{0_n}).value() == 4); + CHECK(shape.at_maybe(ff_dim_t{1_n}).value() == 3); + CHECK(shape.at_maybe(ff_dim_t{2_n}).value() == 2); + CHECK(shape.at_maybe(ff_dim_t{3_n}).value() == 1); + CHECK(!shape.at_maybe(ff_dim_t{4_n}).has_value()); + } + + SUBCASE("Test subshape()") { + SUBCASE("Test basic subshape") { + ArrayShape ref_shape({2_n, 3_n}); + ArrayShape subshape = + shape.sub_shape(legion_dim_t{1_n}, legion_dim_t{3_n}); + + CHECK(ref_shape == subshape); + } + + SUBCASE("Test empty subshape") { + ArrayShape ref_shape(std::vector{}); + ArrayShape subshape = + shape.sub_shape(legion_dim_t{0_n}, legion_dim_t{0_n}); + CHECK(ref_shape == subshape); + } + + SUBCASE("Test subshape with no start") { + ArrayShape ref_shape({1_n, 2_n, 3_n}); + ArrayShape subshape = shape.sub_shape(std::nullopt, legion_dim_t{3_n}); + CHECK(ref_shape == subshape); + } + + SUBCASE("Test subshape with no end") { + ArrayShape ref_shape({2_n, 3_n, 4_n}); + ArrayShape subshape = shape.sub_shape(legion_dim_t{1_n}, std::nullopt); + CHECK(ref_shape == subshape); + } + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index bd0167a677..6b54554a9b 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ 
b/lib/kernels/test/src/test_attention_kernel.cc @@ -41,15 +41,15 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_tensor_shape_from_legion_dims( + TensorShape query_shape = make_tensor_shape_from_ff_ordered( {qoSeqLength, num_samples, qSize}, DataType::FLOAT); - TensorShape key_shape = make_tensor_shape_from_legion_dims( + TensorShape key_shape = make_tensor_shape_from_ff_ordered( {kvSeqLength, num_samples, kSize}, DataType::FLOAT); - TensorShape value_shape = make_tensor_shape_from_legion_dims( + TensorShape value_shape = make_tensor_shape_from_ff_ordered( {kvSeqLength, num_samples, vSize}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); - TensorShape weight_shape = make_tensor_shape_from_legion_dims( + TensorShape weight_shape = make_tensor_shape_from_ff_ordered( {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index d78d5daee5..ba9b3ac0e2 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -22,11 +22,11 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_tensor_shape_from_legion_dims({m, k, batch}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_tensor_shape_from_legion_dims({k, n, batch}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index d0ec2559ba..698a320a69 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -29,13 +29,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {output_n, output_c, output_h, output_w}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {output_n, output_c, output_h, output_w}, DataType::FLOAT); - TensorShape scale_shape = make_tensor_shape_from_legion_dims( + TensorShape scale_shape = make_tensor_shape_from_ff_ordered( {output_n, output_c, output_h, output_w}, DataType::FLOAT); - TensorShape bias_shape = make_tensor_shape_from_legion_dims( + TensorShape bias_shape = make_tensor_shape_from_ff_ordered( {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index c59d8eae3f..d314a6bcc2 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -11,9 +11,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = 
create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::DOUBLE); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = @@ -48,9 +48,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10_n, 2_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n, 2_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({10_n, 2_n}, DataType::DOUBLE); + make_tensor_shape_from_ff_ordered({10_n, 2_n}, DataType::DOUBLE); // Only calling forward kernel as backward kernel is exactly the same SUBCASE("forward_kernel") { @@ -72,7 +72,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 97fa81920b..b30d1ab7f4 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({5_n, 5_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({5_n, 5_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -72,7 +72,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -95,8 +95,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, input_grad_accessor_cpu); - CHECK(accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 22da72912a..f8bc31c3d5 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_cols, TensorShape output_shape, ff_dim_t concat_axis) { - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {input_rows, input_cols}, DataType::FLOAT); std::vector input_accessors = @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test forward concat, axis = 0") { nonnegative_int input_rows = 2_n; nonnegative_int input_cols = 4_n; - TensorShape output_shape = 
make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {num_inputs * input_rows, input_cols}, DataType::FLOAT); run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); } @@ -49,7 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test forward concat, axis = 1") { nonnegative_int input_rows = 4_n; nonnegative_int input_cols = 2_n; - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {input_rows, num_inputs * input_cols}, DataType::FLOAT); run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); } @@ -60,7 +60,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_cols, TensorShape output_shape, ff_dim_t concat_axis) { - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {input_rows, input_cols}, DataType::FLOAT); GenericTensorAccessorR output_grad_accessor = @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test backward concat, axis = 0") { nonnegative_int input_rows = 2_n; nonnegative_int input_cols = 4_n; - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {num_inputs * input_rows, input_cols}, DataType::FLOAT); run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); } @@ -92,7 +92,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test backward concat, axis = 1") { nonnegative_int input_rows = 4_n; nonnegative_int input_cols = 2_n; - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {input_rows, num_inputs * input_cols}, DataType::FLOAT); run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); } diff --git a/lib/kernels/test/src/test_copy_tensor_accessor.cc b/lib/kernels/test/src/test_copy_tensor_accessor.cc new file mode 100644 index 0000000000..a6a4cfde53 --- /dev/null +++ b/lib/kernels/test/src/test_copy_tensor_accessor.cc @@ -0,0 +1,76 @@ +#include "doctest/doctest.h" +#include "kernels/accessor.h" +#include "op-attrs/datatype_value.h" +#include "test_utils.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test copy_tensor_accessor") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + TensorShape shape = + make_tensor_shape_from_ff_ordered({5_n, 5_n}, DataType::FLOAT); + + SUBCASE("Test copy_tensor_accessor_r") { + GenericTensorAccessorR src_accessor = + create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_tensor_accessor_r(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + } + + SUBCASE("Test copy_tensor_accessor_w") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_tensor_accessor_w(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + } + + SUBCASE("Test copy_accessor_r_to_cpu_if_necessary") { + SUBCASE("Test necessary") { + GenericTensorAccessorR src_accessor = + create_random_filled_accessor_r(shape, gpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_accessor_r_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + + 
SUBCASE("Test not necessary") { + GenericTensorAccessorR src_accessor = + create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_accessor_r_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + } + + SUBCASE("Test copy_accessor_w_to_cpu_if_necessary") { + SUBCASE("Test necessary") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, gpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_accessor_w_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + + SUBCASE("Test not necessary") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_accessor_w_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + } + } +} diff --git a/lib/kernels/test/src/test_datatype_dispatch.cc b/lib/kernels/test/src/test_datatype_dispatch.cc new file mode 100644 index 0000000000..41737d715a --- /dev/null +++ b/lib/kernels/test/src/test_datatype_dispatch.cc @@ -0,0 +1,65 @@ +#include "doctest/doctest.h" +#include "kernels/datatype_dispatch.h" + +using namespace ::FlexFlow; + +template +struct TestDatatypeDispatch1 { + int operator()(int value) { + if (DT == DataType::FLOAT) { + return value + 1; + } else if (DT == DataType::INT32) { + return value + 2; + } else { + return value + 3; + } + } +}; + +template +struct TestDatatypeDispatch2 { + void operator()(int &value) { + if (IDT == DataType::INT32 && ODT == DataType::FLOAT) { + value *= 2; + } else if (IDT == DataType::FLOAT && ODT == DataType::INT32) { + value *= 3; + } else { + value *= 4; + } + } +}; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test DataTypeDispatch") { + SUBCASE("Test DataTypeDispatch1") { + CHECK(DataTypeDispatch1{}(DataType::FLOAT, 1) == + 2); + CHECK(DataTypeDispatch1{}(DataType::INT32, 1) == + 3); + CHECK(DataTypeDispatch1{}(DataType::DOUBLE, 1) == + 4); + } + + SUBCASE("Test DataTypeDispatch2") { + int value = 1; + + SUBCASE("Case One") { + DataTypeDispatch2{}( + DataType::INT32, DataType::FLOAT, value); + CHECK(value == 2); + } + + SUBCASE("Case Two") { + DataTypeDispatch2{}( + DataType::FLOAT, DataType::INT32, value); + CHECK(value == 3); + } + + SUBCASE("Test Three") { + DataTypeDispatch2{}( + DataType::DOUBLE, DataType::DOUBLE, value); + CHECK(value == 4); + } + } + } +} diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 1a34c59be6..e5eba341f3 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input_shape = - make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 238c4ac361..ee4554d00a 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = - 
make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 043617c790..64cc824b9b 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -37,21 +37,21 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test gather forward, 2D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::FLOAT); run_forward_test(input_shape, index_shape, output_shape); } SUBCASE("test gather forward, 1D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({10_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); run_forward_test(input_shape, index_shape, output_shape); } } @@ -77,11 +77,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test gather backward, 2D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::FLOAT); run_backward_test(input_shape, index_shape, output_shape); } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 8368fe4efd..4d5802936e 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -12,11 +12,11 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({feature_size}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, diff --git a/lib/kernels/test/src/test_legion_dim.cc b/lib/kernels/test/src/test_legion_dim.cc new file mode 100644 index 0000000000..c06b779ad8 --- /dev/null +++ b/lib/kernels/test/src/test_legion_dim.cc @@ -0,0 +1,29 @@ +#include "doctest/doctest.h" +#include "kernels/legion_dim.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LegionDim") { + SUBCASE("Test add_to_legion_dim") { + legion_dim_t dim{1_n}; + CHECK(add_to_legion_dim(dim, 2) == 
legion_dim_t{3_n}); + } + + SUBCASE("Test legion_dim_from_ff_dim") { + CHECK(legion_dim_from_ff_dim(ff_dim_t{0_n}, 4_n) == legion_dim_t{3_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{1_n}, 4_n) == legion_dim_t{2_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{2_n}, 4_n) == legion_dim_t{1_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{3_n}, 4_n) == legion_dim_t{0_n}); + } + + SUBCASE("Test LegionOrdered") { + LegionOrdered legion_ordered{1, 2, 3, 4}; + + SUBCASE("Test ff_ordered_from_legion_ordered") { + CHECK(ff_ordered_from_legion_ordered(legion_ordered) == + FFOrdered{4, 3, 2, 1}); + } + } + } +} diff --git a/lib/kernels/test/src/test_local_cpu_allocator.cc b/lib/kernels/test/src/test_local_cpu_allocator.cc new file mode 100644 index 0000000000..fa6bce36db --- /dev/null +++ b/lib/kernels/test/src/test_local_cpu_allocator.cc @@ -0,0 +1,19 @@ +#include "kernels/local_cpu_allocator.h" +#include "doctest/doctest.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LocalCPUAllocator") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("Test allocate and deallocate") { + void *ptr = cpu_allocator.allocate(100); + CHECK(ptr != nullptr); + cpu_allocator.deallocate(ptr); + } + + SUBCASE("Test get_allocation_device_type") { + CHECK(cpu_allocator.get_allocation_device_type() == DeviceType::CPU); + } + } +} diff --git a/lib/kernels/test/src/test_local_cuda_allocator.cc b/lib/kernels/test/src/test_local_cuda_allocator.cc new file mode 100644 index 0000000000..c091576bd3 --- /dev/null +++ b/lib/kernels/test/src/test_local_cuda_allocator.cc @@ -0,0 +1,19 @@ +#include "kernels/local_cuda_allocator.h" +#include "doctest/doctest.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LocalCUDAAllocator") { + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + + SUBCASE("Test allocate and deallocate") { + void *ptr = gpu_allocator.allocate(100); + CHECK(ptr != nullptr); + gpu_allocator.deallocate(ptr); + } + + SUBCASE("Test get_allocation_device_type") { + CHECK(gpu_allocator.get_allocation_device_type() == DeviceType::GPU); + } + } +} diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 87b564d284..841c9a82ab 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -37,21 +37,21 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test gather forward, 2D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::FLOAT); run_forward_test(input_shape, index_shape, output_shape); } SUBCASE("test gather forward, 1D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({10_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); run_forward_test(input_shape, 
index_shape, output_shape); } } @@ -77,11 +77,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test gather backward, 2D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::FLOAT); run_backward_test(input_shape, index_shape, output_shape); } } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index c1be78bd16..e9fab697bb 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_perf_metrics.cc b/lib/kernels/test/src/test_perf_metrics.cc new file mode 100644 index 0000000000..e958a808b7 --- /dev/null +++ b/lib/kernels/test/src/test_perf_metrics.cc @@ -0,0 +1,127 @@ +#include "kernels/perf_metrics.h" +#include "doctest/doctest.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test PerfMetrics Constructors and Metric Functions") { + SUBCASE("Test constructor with start_time only") { + double start = 100.0; + PerfMetrics pm(start); + + CHECK(pm.start_time == start); + CHECK(pm.current_time == start); + + CHECK(pm.train_all == 0); + if (pm.train_correct.has_value()) { + CHECK(pm.train_correct.value() == 0); + } + + CHECK(!pm.cce_loss.has_value()); + + if (pm.sparse_cce_loss.has_value()) { + CHECK(pm.sparse_cce_loss.value() == doctest::Approx(0.0f)); + } + if (pm.mse_loss.has_value()) { + CHECK(pm.mse_loss.value() == doctest::Approx(0.0f)); + } + if (pm.rmse_loss.has_value()) { + CHECK(pm.rmse_loss.value() == doctest::Approx(0.0f)); + } + if (pm.mae_loss.has_value()) { + CHECK(pm.mae_loss.value() == doctest::Approx(0.0f)); + } + } + + SUBCASE("Test full constructor and throughput/accuracy") { + int train_all = 200; + int train_correct = 150; + float cce = 1.2f; + float sparse_cce = 1.0f; + float mse = 0.5f; + float rmse = 0.7f; + float mae = 0.3f; + double start = 100.0; + double curr = 110.0; + PerfMetrics pm(train_all, + train_correct, + cce, + sparse_cce, + mse, + rmse, + mae, + start, + curr); + + CHECK(pm.train_all == train_all); + CHECK(pm.train_correct.has_value()); + CHECK(pm.train_correct.value() == train_correct); + CHECK(pm.cce_loss.has_value()); + CHECK(pm.cce_loss.value() == doctest::Approx(cce)); + CHECK(pm.sparse_cce_loss.has_value()); + CHECK(pm.sparse_cce_loss.value() == doctest::Approx(sparse_cce)); + CHECK(pm.mse_loss.has_value()); + CHECK(pm.mse_loss.value() == doctest::Approx(mse)); + CHECK(pm.rmse_loss.has_value()); + CHECK(pm.rmse_loss.value() == doctest::Approx(rmse)); + CHECK(pm.mae_loss.has_value()); + CHECK(pm.mae_loss.value() == doctest::Approx(mae)); + CHECK(pm.start_time == start); + CHECK(pm.current_time == curr); + + float expected_throughput = train_all / (curr - start); + CHECK(get_throughput(pm) == doctest::Approx(expected_throughput)); + + float 
expected_accuracy = static_cast(train_correct) / train_all; + CHECK(get_accuracy(pm) == doctest::Approx(expected_accuracy)); + } + + SUBCASE("Test update function") { + PerfMetrics pm1(100, 50, 1.0f, 0.5f, 0.3f, 0.2f, 0.1f, 0.0, 1.0); + PerfMetrics pm2(50, 30, 0.5f, 0.3f, 0.2f, 0.1f, 0.05f, 0.0, 1.5); + + PerfMetrics updated = update(pm1, pm2); + + CHECK(updated.train_all == (100 + 50)); + if (updated.train_correct.has_value()) { + CHECK(updated.train_correct.value() == (50 + 30)); + } + + CHECK(updated.cce_loss.has_value()); + CHECK(updated.cce_loss.value() == doctest::Approx(1.0f + 0.5f)); + CHECK(updated.sparse_cce_loss.has_value()); + CHECK(updated.sparse_cce_loss.value() == doctest::Approx(0.5f + 0.3f)); + CHECK(updated.mse_loss.has_value()); + CHECK(updated.mse_loss.value() == doctest::Approx(0.3f + 0.2f)); + CHECK(updated.rmse_loss.has_value()); + CHECK(updated.rmse_loss.value() == doctest::Approx(0.2f + 0.1f)); + CHECK(updated.mae_loss.has_value()); + CHECK(updated.mae_loss.value() == doctest::Approx(0.1f + 0.05f)); + CHECK(updated.current_time == pm2.current_time); + } + + SUBCASE("Test apply_scale function") { + PerfMetrics pm(100, 50, 2.0f, 1.0f, 0.8f, 0.6f, 0.4f, 0.0, 2.0); + float scale = 0.5f; + PerfMetrics scaled = apply_scale(pm, scale); + + CHECK(scaled.cce_loss.has_value()); + CHECK(scaled.cce_loss.value() == doctest::Approx(2.0f * scale)); + CHECK(scaled.sparse_cce_loss.has_value()); + CHECK(scaled.sparse_cce_loss.value() == doctest::Approx(1.0f * scale)); + CHECK(scaled.mse_loss.has_value()); + CHECK(scaled.mse_loss.value() == doctest::Approx(0.8f * scale)); + CHECK(scaled.rmse_loss.has_value()); + CHECK(scaled.rmse_loss.value() == doctest::Approx(0.6f * scale)); + CHECK(scaled.mae_loss.has_value()); + CHECK(scaled.mae_loss.value() == doctest::Approx(0.4f * scale)); + + CHECK(scaled.train_all == pm.train_all); + if (scaled.train_correct.has_value()) { + CHECK(scaled.train_correct.value() == pm.train_correct.value()); + } + CHECK(scaled.start_time == pm.start_time); + CHECK(scaled.current_time == pm.current_time); + } + } +} diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ff74f6fb28..06db1989eb 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -49,9 +49,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {input_w, input_h, input_c, input_n}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index f91c4959cc..921a5ff08c 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {10_n, 10_n, 10_n, 10_n, 10_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { TensorShape output_shape = - 
make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 87834d83ac..6009b3c501 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -9,9 +9,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int num_replicas = 10_n; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -51,8 +51,8 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int num_replicas = 2_n; TensorShape input_shape = - make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( + make_tensor_shape_from_ff_ordered({5_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {num_replicas, 5_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ @@ -82,7 +82,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -108,7 +108,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu, num_replicas.unwrap_nonnegative()); - CHECK(accessors_are_equal(input_grad_accessor_gpu, + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, input_grad_accessor_cpu)); } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index ee7530c017..fa67953947 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 481958fdfc..78ee803da6 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int reverse_dim_size = 10_n; nonnegative_int in_blk_size = 10_n; - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; @@ -65,7 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int reverse_dim_size = 4_n; nonnegative_int in_blk_size = 3_n; - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; @@ -106,7 +106,7 @@ TEST_SUITE(FF_TEST_SUITE) { reverse_dim_size.unwrap_nonnegative(), 
in_blk_size.unwrap_nonnegative()); - CHECK(accessors_are_equal(output_accessor_cpu, output_accessor_cpu)); + CHECK(accessor_data_is_equal(output_accessor_cpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -139,7 +139,7 @@ TEST_SUITE(FF_TEST_SUITE) { reverse_dim_size.unwrap_nonnegative(), in_blk_size.unwrap_nonnegative()); - CHECK(accessors_are_equal(input_grad_accessor_gpu, + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, input_grad_accessor_cpu)); } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index d4fb496f7b..ecb996227f 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index d98f88a30e..20a6898896 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -21,9 +21,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({50_n}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index cac43c6ff3..ac8876ac98 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index bc5f48654a..e335e5b449 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,12 +1,13 @@ #include "test_utils.h" +#include "kernels/datatype_dispatch.h" #include "op-attrs/tensor_shape.h" #include "utils/join_strings.h" #include namespace FlexFlow { -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, - DataType DT) { +TensorShape make_tensor_shape_from_ff_ordered(FFOrdered dims, + DataType DT) { return TensorShape{ TensorDims{ dims, @@ -128,26 +129,6 @@ bool contains_non_zero(GenericTensorAccessorR const &accessor) { cpu_accessor.data_type, cpu_accessor); } -GenericTensorAccessorR - copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorR cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); - } - return cpu_accessor; -} - -GenericTensorAccessorW - copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorW cpu_accessor = 
accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); - } - return cpu_accessor; -} - template struct Print2DCPUAccessorR { void operator()(GenericTensorAccessorR const &accessor, @@ -179,44 +160,6 @@ void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, accessor.data_type, cpu_accessor, stream); } -template -struct AccessorsAreEqual { - bool operator()(GenericTensorAccessorR const &accessor_a, - GenericTensorAccessorR const &accessor_b) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor_a = - copy_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); - GenericTensorAccessorR cpu_accessor_b = - copy_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); - - using T = real_type_t
<DT>;
-    T const *a_data_ptr = cpu_accessor_a.get<DT>();
-    T const *b_data_ptr = cpu_accessor_b.get<DT>
(); - - int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); - for (size_t i = 0; i < volume; i++) { - if (a_data_ptr[i] != b_data_ptr[i]) { - return false; - } - } - - return true; - } -}; - -bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, - GenericTensorAccessorR const &accessor_b) { - if (accessor_a.shape != accessor_b.shape) { - throw mk_runtime_error( - fmt::format("accessors_are_equal expected accessors to have the same " - "shape, but received: {} != {}", - accessor_a.shape, - accessor_b.shape)); - } - return DataTypeDispatch1{}( - accessor_a.data_type, accessor_a, accessor_b); -} - template struct CreateFilledAccessorW { GenericTensorAccessorW operator()(TensorShape const &shape, diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 093a9a4a97..2e7294ed1d 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_KERNELS_TEST_UTILS #include "kernels/copy_tensor_accessor.h" -#include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" @@ -29,27 +28,16 @@ GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, Allocator &allocator); -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, - DataType DT); +TensorShape make_tensor_shape_from_ff_ordered(FFOrdered dims, + DataType DT); bool contains_non_zero(GenericTensorAccessorR const &accessor); void fill_with_zeros(GenericTensorAccessorW const &accessor); -GenericTensorAccessorW - copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, - Allocator &allocator); - -GenericTensorAccessorR - copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, - Allocator &allocator); - void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, std::ostream &stream); -bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, - GenericTensorAccessorR const &accessor_b); - GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, DataTypeValue val); From 4fc04751c7b5550f19da89ac50a15ae8ad8ca1ee Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 24 Feb 2025 19:40:45 -0800 Subject: [PATCH 41/42] remove . 
files --- .envrc | 3 --- .proj.toml | 1 - .vimrc | 8 -------- 3 files changed, 12 deletions(-) delete mode 100644 .envrc delete mode 100644 .vimrc diff --git a/.envrc b/.envrc deleted file mode 100644 index 2797f0f929..0000000000 --- a/.envrc +++ /dev/null @@ -1,3 +0,0 @@ -source_up_if_exists - -use flake diff --git a/.proj.toml b/.proj.toml index b3b90bbada..10307a6efa 100644 --- a/.proj.toml +++ b/.proj.toml @@ -15,7 +15,6 @@ build_targets = [ "models", "export-model-arch", "substitution-to-dot", - "kernels-tests", ] test_targets = [ diff --git a/.vimrc b/.vimrc deleted file mode 100644 index 4c8a8a8279..0000000000 --- a/.vimrc +++ /dev/null @@ -1,8 +0,0 @@ -" example search path configuration -set path=lib/runtime/**,lib/** - -" set build target -" let g:target = "pcg" - -" set test target -" let g:test_target = "utils-test" From 8b72dcd360c5daa1391609b84eba12e3445d8383 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 24 Feb 2025 19:51:41 -0800 Subject: [PATCH 42/42] format issues --- lib/kernels/src/perf_metrics.cc | 4 ++-- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_local_cpu_allocator.cc | 2 +- lib/kernels/test/src/test_local_cuda_allocator.cc | 2 +- lib/kernels/test/src/test_perf_metrics.cc | 6 +++--- lib/kernels/test/src/test_replicate_kernel.cc | 6 +++--- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/kernels/src/perf_metrics.cc b/lib/kernels/src/perf_metrics.cc index ab0e113a26..61163caeae 100644 --- a/lib/kernels/src/perf_metrics.cc +++ b/lib/kernels/src/perf_metrics.cc @@ -15,8 +15,8 @@ PerfMetrics::PerfMetrics(int _train_all, double _start_time_micro, double _current_time_micro) : train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss), - sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss), - rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro), + sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss), + rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro), current_time(_current_time_micro) {} float get_throughput(PerfMetrics const &m) { diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index b30d1ab7f4..3a7a70c862 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -95,7 +95,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, input_grad_accessor_cpu); - CHECK(accessor_data_is_equal(input_grad_accessor_gpu, + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, input_grad_accessor_cpu)); } } diff --git a/lib/kernels/test/src/test_local_cpu_allocator.cc b/lib/kernels/test/src/test_local_cpu_allocator.cc index fa6bce36db..d5552e4cb0 100644 --- a/lib/kernels/test/src/test_local_cpu_allocator.cc +++ b/lib/kernels/test/src/test_local_cpu_allocator.cc @@ -1,5 +1,5 @@ -#include "kernels/local_cpu_allocator.h" #include "doctest/doctest.h" +#include "kernels/local_cpu_allocator.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { diff --git a/lib/kernels/test/src/test_local_cuda_allocator.cc b/lib/kernels/test/src/test_local_cuda_allocator.cc index c091576bd3..7c3e62dbeb 100644 --- a/lib/kernels/test/src/test_local_cuda_allocator.cc +++ b/lib/kernels/test/src/test_local_cuda_allocator.cc @@ -1,5 +1,5 @@ -#include "kernels/local_cuda_allocator.h" #include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" using namespace ::FlexFlow; 
TEST_SUITE(FF_TEST_SUITE) { diff --git a/lib/kernels/test/src/test_perf_metrics.cc b/lib/kernels/test/src/test_perf_metrics.cc index e958a808b7..045788bae3 100644 --- a/lib/kernels/test/src/test_perf_metrics.cc +++ b/lib/kernels/test/src/test_perf_metrics.cc @@ -1,5 +1,5 @@ -#include "kernels/perf_metrics.h" #include "doctest/doctest.h" +#include "kernels/perf_metrics.h" using namespace ::FlexFlow; @@ -16,9 +16,9 @@ TEST_SUITE(FF_TEST_SUITE) { if (pm.train_correct.has_value()) { CHECK(pm.train_correct.value() == 0); } - + CHECK(!pm.cce_loss.has_value()); - + if (pm.sparse_cce_loss.has_value()) { CHECK(pm.sparse_cce_loss.value() == doctest::Approx(0.0f)); } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 6009b3c501..b2c8ea0c19 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -52,8 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_ff_ordered({5_n}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_ff_ordered( - {num_replicas, 5_n}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({num_replicas, 5_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -109,7 +109,7 @@ TEST_SUITE(FF_TEST_SUITE) { num_replicas.unwrap_nonnegative()); CHECK(accessor_data_is_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 78ee803da6..01eded4297 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -140,7 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size.unwrap_nonnegative()); CHECK(accessor_data_is_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + input_grad_accessor_cpu)); } } }
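
Note on the comparison helpers this series introduces: accessor_data_is_equal
compares only the element data (each side is mirrored to the CPU before the
comparison, per the implementation in accessor.cc above), while
operator==/accessors_are_equal additionally require matching data type,
device type, and shape. That distinction is why the GPU-vs-CPU kernel tests
above were switched from accessors_are_equal to accessor_data_is_equal. The
sketch below illustrates the difference using only helpers defined in this
series; it is not part of the patch, and the test name is illustrative.

#include "doctest/doctest.h"
#include "kernels/accessor.h"
#include "test_utils.h"

using namespace ::FlexFlow;

TEST_SUITE(FF_TEST_SUITE) {
  TEST_CASE("Sketch: data equality vs. full accessor equality") {
    Allocator cpu_allocator = create_local_cpu_memory_allocator();
    Allocator gpu_allocator = create_local_cuda_memory_allocator();

    TensorShape shape =
        make_tensor_shape_from_ff_ordered({5_n, 5_n}, DataType::FLOAT);

    // Fill a tensor on the GPU, then mirror it into host memory.
    GenericTensorAccessorR gpu_accessor =
        create_random_filled_accessor_r(shape, gpu_allocator);
    GenericTensorAccessorR cpu_accessor =
        copy_accessor_r_to_cpu_if_necessary(gpu_accessor, cpu_allocator);

    // The element data matches across the device boundary...
    CHECK(accessor_data_is_equal(gpu_accessor, cpu_accessor));

    // ...but full equality fails, because device_type differs.
    CHECK(gpu_accessor != cpu_accessor);
  }
}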