From c0bf108c177b29a4f100f3625de7a3f40464148c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 11 Jul 2024 14:33:42 -0700 Subject: [PATCH 01/42] test_utils refactor, local_cpu_allocator --- .../include/kernels/local_cpu_allocator.h | 22 ++ lib/kernels/src/local_cpu_allocator.cc | 35 ++ lib/kernels/test/src/test_attention_kernel.cc | 26 +- .../test/src/test_batch_matmul_kernel.cc | 10 +- .../test/src/test_batch_norm_kernel.cc | 54 ++-- lib/kernels/test/src/test_cast_kernel.cc | 84 +++-- lib/kernels/test/src/test_combine_kernel.cc | 81 ++++- lib/kernels/test/src/test_concat_kernel.cc | 33 +- lib/kernels/test/src/test_dropout.cc | 19 +- lib/kernels/test/src/test_flat_kernel.cc | 40 +-- lib/kernels/test/src/test_gather_kernels.cc | 29 +- .../test/src/test_layer_norm_kernels.cc | 25 +- lib/kernels/test/src/test_partition_kernel.cc | 41 +-- lib/kernels/test/src/test_pool_2d_kernels.cc | 32 +- lib/kernels/test/src/test_reduction_kernel.cc | 37 +-- lib/kernels/test/src/test_replicate_kernel.cc | 102 ++++-- lib/kernels/test/src/test_reshape_kernel.cc | 35 +- lib/kernels/test/src/test_reverse_kernels.cc | 103 +++++- lib/kernels/test/src/test_softmax_kernel.cc | 25 +- lib/kernels/test/src/test_split_kernel.cc | 19 +- lib/kernels/test/src/test_transpose_kernel.cc | 29 +- lib/kernels/test/src/test_utils.cc | 304 +++++++++++++----- lib/kernels/test/src/test_utils.h | 88 ++++- 23 files changed, 842 insertions(+), 431 deletions(-) create mode 100644 lib/kernels/include/kernels/local_cpu_allocator.h create mode 100644 lib/kernels/src/local_cpu_allocator.cc diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h new file mode 100644 index 0000000000..27dcc9d854 --- /dev/null +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -0,0 +1,22 @@ +#include "kernels/allocation.h" +#include <unordered_set> + +namespace FlexFlow { + +struct LocalCPUAllocator : public IAllocator { + LocalCPUAllocator() = default; + LocalCPUAllocator(LocalCPUAllocator const &) = delete; + LocalCPUAllocator(LocalCPUAllocator &&) = delete; + ~LocalCPUAllocator() override; + + void *allocate(size_t) override; + void deallocate(void *) override; + +private: + std::unordered_set<void *> ptrs; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); + +Allocator create_local_cpu_memory_allocator(); + +} // namespace FlexFlow diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc new file mode 100644 index 0000000000..6553dc2f88 --- /dev/null +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -0,0 +1,35 @@ +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" + +namespace FlexFlow { +void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + if (ptr != nullptr) { + this->ptrs.insert(ptr); + } else { + throw std::bad_alloc(); + } + return ptr; +} + +void LocalCPUAllocator::deallocate(void *ptr) { + if (contains(this->ptrs, ptr)) { + free(ptr); + this->ptrs.erase(ptr); + } else { + throw std::runtime_error( + "Deallocating a pointer that was not allocated by this Allocator"); + } +} + +LocalCPUAllocator::~LocalCPUAllocator() { + for (auto ptr : ptrs) { + free(ptr); + } +} + +Allocator create_local_cpu_memory_allocator() { + return Allocator::create<LocalCPUAllocator>(); +} + +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index d44129ece1..023233ecb0 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ 
b/lib/kernels/test/src/test_attention_kernel.cc @@ -13,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t qoSeqLength = 20, kvSeqLength = 20; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -33,16 +35,16 @@ kvSeqLength, false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); + TensorShape query_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, qSize}, DataType::FLOAT); + TensorShape key_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, kSize}, DataType::FLOAT); + TensorShape value_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, vSize}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); TensorShape weight_shape = - make_float_tensor_shape_from_legion_dims({state.weightSize}); + make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -66,9 +68,7 @@ weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector<float> host_output = load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 18e6977148..8a11a069f5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,16 +15,18 @@ TEST_SUITE(FF_TEST_SUITE) { size_t seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); + make_tensor_shape_from_legion_dims({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); + make_tensor_shape_from_legion_dims({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 8487bbda6a..03a3a1ad40 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/batch_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using 
namespace ::FlexFlow; @@ -9,7 +10,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -23,25 +26,25 @@ output_w, true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape scale_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape bias_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -50,10 +53,7 @@ scale_accessor.get_float_ptr(), bias_accessor.get_float_ptr()); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -68,28 +68,18 @@ Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr(), output_accessor.get_float_ptr(), + output_grad_accessor.get_float_ptr(), + input_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), scale_accessor.get_float_ptr(), scale_grad_accessor.get_float_ptr(), bias_grad_accessor.get_float_ptr(), input_accessor.shape.num_elements()); - std::vector<float> host_input_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector<float> host_scale_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector<float> host_bias_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + 
CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b110208bce..1be5839a9c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_cpu.h" #include "test_utils.h" -#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -11,46 +11,68 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100, 100}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + make_tensor_shape_from_legion_dims({100, 100}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector<double> host_double_data = - load_data_to_host_from_device<double>( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector<float> host_grad_float_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, 2}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10, 2}, DataType::DOUBLE); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + 
managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2e1000cb95..a4688a1030 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,37 @@ #include "doctest/doctest.h" #include "kernels/combine_kernels.h" +#include "kernels/combine_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +39,64 @@ output_grad_accessor, input_grad_accessor); - std::vector<float> host_input_grad = load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5, 5}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR 
input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 2212e384fa..4607171a54 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,21 +1,24 @@ #include "doctest/doctest.h" #include "kernels/concat_kernels.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - size_t num_inputs = 3; - size_t size_per_input = 100; - ff_dim_t concat_axis = ff_dim_t{nonnegative_int{0}}; + size_t num_inputs = 2; + size_t size_per_input = 10; + ff_dim_t concat_axis = ff_dim_t{1}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); + make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {num_inputs, size_per_input}, DataType::FLOAT); Allocator allocator = create_local_cuda_memory_allocator(); @@ -33,21 +36,15 @@ input_accessors, concat_axis); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector<GenericTensorAccessorW> input_grad_accessors = - repeat(num_inputs, [&]() { - return allocator.allocate_tensor(input_shape); - }); + create_random_filled_accessor_r(output_shape, allocator); + std::vector<GenericTensorAccessorW> input_grad_accessors = repeat( + num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), output_grad_accessor, input_grad_accessors, diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index e29143e251..4be2bdf7bb 
100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,6 +1,7 @@ #include "doctest/doctest.h" #include "kernels/dropout_kernels.h" #include "test_utils.h" +#include "utils/containers/count.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -13,11 +14,13 @@ }; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -25,14 +28,12 @@ managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); auto get_zero_count = [](std::vector<float> const &data) { - return std::count_if( - data.begin(), data.end(), [](float x) { return x == 0.0f; }); + return count(data, [](float x) { return x == 0.0f; }); }; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +42,7 @@ input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector<float> host_output_accessor = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 70894858e3..0bb69aa1dc 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/flat_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,15 +8,18 @@ TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +29,21 @@ input_accessor, output_accessor.get_float_ptr()); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - 
create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector<float> backward_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 88ac2f6889..7f97563217 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,24 +5,26 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,16 +34,12 @@ index_accessor, output_accessor); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -51,10 +49,7 @@ index_accessor, input_grad_accessor); - std::vector<float> host_input_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git 
a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 03b2f56bb9..7d7298f83d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/layer_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -11,13 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +34,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 437b37e954..e88c811803 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ #include "doctest/doctest.h" #include "kernels/partition_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -15,47 +18,33 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = 
input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector<float> host_grad_input_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector<float> expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ebb92d39db..00fa968235 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/pool_2d_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -12,7 +13,9 @@ PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -36,10 +39,10 @@ stride_w, pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_w, input_h, input_c, input_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -52,28 +55,23 @@ input_accessor.ptr, output_accessor.ptr); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + 
CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector<float> host_input_grad = load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 1ea740f336..1c389cb20d 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/reduction_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,20 +8,22 @@ TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10, 10, 10, 10}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {10, 10, 10, 10, 10}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({10}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -29,30 +32,22 @@ output_accessor, num_replicas); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector<float> expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 1.0f); - std::vector<float> host_grad_data = load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + 
CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 86d790f03c..27223cc7b5 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,113 @@ #include "doctest/doctest.h" #include "kernels/replicate_kernels.h" +#include "kernels/replicate_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + TEST_CASE("Call Replicate Forward and Backward Kernels") { std::size_t num_replicas = 10; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = input_shape; + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, output_grad_accessor, + input_grad_accessor, + num_replicas); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + std::size_t num_replicas = 2; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + 
+ // Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, num_replicas); - std::vector<float> check_aggregated_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index f56bfacc2b..5c04012da2 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,12 +5,15 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +21,28 @@ SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector<float> expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); 
Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector<float> host_grad_input_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector<float> expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index cdaf65a305..4adf79847a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,5 +1,7 @@ #include "doctest/doctest.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -9,18 +11,21 @@ std::size_t in_blk_size = 10; std::size_t num_out_blks = 1; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,17 +37,14 @@ in_blk_size, input_accessor.shape.num_elements()); - std::vector<float> check_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + allocator.allocate_tensor(input_shape); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), num_out_blks, reverse_dim_size, in_blk_size, input_grad_accessor.shape.num_elements()); - std::vector<float> host_grad_input_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 4; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 2; + + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + 
Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Reverse Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + // Run CPU Reverse Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + // Run CPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index f49c1ebbcc..5519c30b80 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -8,12 +8,15 @@ TEST_CASE("Test Softmax Kernel Operations") { int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( @@ -31,30 +34,22 @@ input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + 
create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements()); - std::vector<float> expected_input_grad_data = - std::vector<float>(input_grad_accessor.shape.num_elements(), 1.0f); - std::vector<float> host_input_grad_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index f2346c9244..34993fa151 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,6 +1,8 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; @@ -11,20 +13,23 @@ coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); - std::vector<float *> output_ptrs(num_outputs); - generate_n(output_ptrs.begin(), num_outputs, [&]() { + std::vector<float *> output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); return output_accessor.get_float_ptr(); }); @@ -47,8 +52,8 @@ output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 2904fa01ae..0bc85cb8e0 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -7,10 +7,11 @@ TEST_CASE("Test Transpose Kernel Operations") { std::size_t num_dims = 2; - std::vector<ff_dim_t> perm = {ff_dim_t{nonnegative_int{0}}, - ff_dim_t{nonnegative_int{1}}}; + std::vector<ff_dim_t> perm = {ff_dim_t{0}, ff_dim_t{1}}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -19,41 +20,33 @@ Kernels::Transpose::init_kernel(num_dims, perm); 
TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector<float> host_output_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector<float> host_grad_input_data = - load_data_to_host_from_device<float>( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index b591642570..bfed1241ba 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,105 +1,249 @@ #include "test_utils.h" +#include "op-attrs/tensor_shape.h" +#include <random> -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector<float> host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered<size_t> const &dims, + DataType DT) { + return TensorShape{ + TensorDims{ + ff_ordered_from_legion_ordered(dims), + }, + DT, + }; +} + +template <DataType DT> +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t<DT>
<DT>;
+    T *data_ptr = src_accessor.get<DT>();
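+    // Fill host-side memory first, then copy into the target allocator's
+    // buffer; the distribution is chosen per element type (bernoulli for
+    // bool, uniform reals on [-1, 1], uniform ints on [0, 100]).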
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    size_t num_elements = get_num_elements(shape);
+    if constexpr (std::is_same<T, bool>::value) {
+      std::bernoulli_distribution dist(0.5);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    } else if constexpr (std::is_floating_point<T>::value) {
+      std::uniform_real_distribution<T> dist(-1.0, 1.0);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    } else if constexpr (std::is_integral<T>::value) {
+      std::uniform_int_distribution<T> dist(0, 100);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    }
+
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    copy_accessor_data_to_l_from_r(dst_accessor, src_accessor);
+
+    return dst_accessor;
+  }
+};
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(float),
-                         cudaMemcpyHostToDevice));
+GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  return DataTypeDispatch1<CreateRandomFilledAccessorW>{}(
+      shape.data_type, shape, allocator);
+}
+
+GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  GenericTensorAccessorW accessor =
+      create_random_filled_accessor_w(shape, allocator);
+
+  return read_only_accessor_from_write_accessor(accessor);
+}
+
+template <DataType DT>
+struct FillWithZeros {
+  void operator()(GenericTensorAccessorW const &accessor) {
+    using T = real_type_t<DT>
;
+
+    if (accessor.device_type == DeviceType::CPU) {
+      memset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T));
+    } else {
+      checkCUDA(
+          cudaMemset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T)));
+    }
+  }
+};
 
-  return accessor;
+void fill_with_zeros(GenericTensorAccessorW const &accessor) {
+  DataTypeDispatch1<FillWithZeros>{}(accessor.data_type, accessor);
 }
 
-GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
-                                                Allocator &allocator,
-                                                float val,
-                                                bool cpu_fill) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<float> host_data(volume, val);
-
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(float),
-                         cudaMemcpyHostToDevice));
+template <DataType DT>
+struct CPUAccessorRContainsNonZero {
+  bool operator()(GenericTensorAccessorR const &accessor) {
+    using T = real_type_t<DT>
;
+
+    T const *data_ptr = accessor.get<DT>();
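+    // Assumes the accessor is already CPU-resident; contains_non_zero below
+    // copies GPU tensors to a CPU allocator before dispatching here.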
+
+    for (size_t i = 0; i < accessor.shape.num_elements(); i++) {
+      if (data_ptr[i] != 0) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
 
-  return accessor;
+bool contains_non_zero(GenericTensorAccessorR const &accessor) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR cpu_accessor =
+      copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
+  return DataTypeDispatch1<CPUAccessorRContainsNonZero>{}(
+      cpu_accessor.data_type, cpu_accessor);
 }
 
-GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape,
-                                                     Allocator &allocator,
-                                                     bool cpu_fill) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<float> host_data(volume);
+GenericTensorAccessorR
+    copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor,
+                                        Allocator &cpu_allocator) {
+  GenericTensorAccessorR cpu_accessor = accessor;
+  if (accessor.device_type == DeviceType::GPU) {
+    cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator);
+  }
+  return cpu_accessor;
+}
 
-  for (size_t i = 0; i < volume; i++) {
-    host_data[i] = i;
+GenericTensorAccessorW
+    copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor,
+                                        Allocator &cpu_allocator) {
+  GenericTensorAccessorW cpu_accessor = accessor;
+  if (accessor.device_type == DeviceType::GPU) {
+    cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator);
   }
+  return cpu_accessor;
+}
+
+template <DataType DT>
+struct Print2DCPUAccessorR {
+  void operator()(GenericTensorAccessorR const &accessor,
+                  std::ostream &stream) {
+    using T = real_type_t<DT>
;
+
+    T const *data_ptr = accessor.get<DT>();
+    int rows = accessor.shape.at(legion_dim_t{0});
+    int cols = accessor.shape.at(legion_dim_t{1});
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(float),
-                         cudaMemcpyHostToDevice));
+    for (int i = 0; i < rows; i++) {
+      for (int j = 0; j < cols; j++) {
+        stream << data_ptr[i * cols + j];
+
+        if (j < cols - 1) {
+          stream << " ";
+        }
+      }
+      stream << std::endl;
+    }
   }
+};
 
-  return accessor;
+void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor,
+                                       std::ostream &stream) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR cpu_accessor =
+      copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
+  // Dispatch on the CPU copy (dispatching on the original accessor would
+  // read a GPU pointer from the host).
+  DataTypeDispatch1<Print2DCPUAccessorR>{}(
+      cpu_accessor.data_type, cpu_accessor, stream);
 }
 
-void fill_tensor_accessor_w(GenericTensorAccessorW accessor,
-                            float val,
-                            bool cpu_fill) {
-  LegionTensorDims dims = accessor.shape.dims;
-  size_t volume = accessor.shape.num_elements();
-  std::vector<float> host_data(volume, val);
-
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(float),
-                         cudaMemcpyHostToDevice));
+template <DataType DT>
+struct AccessorsAreEqual {
+  bool operator()(GenericTensorAccessorR const &accessor_a,
+                  GenericTensorAccessorR const &accessor_b) {
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorR cpu_accessor_a =
+        copy_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator);
+    GenericTensorAccessorR cpu_accessor_b =
+        copy_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator);
+
+    using T = real_type_t<DT>
;
+    T const *a_data_ptr = cpu_accessor_a.get<DT>
();
+    T const *b_data_ptr = cpu_accessor_b.get<DT>
();
+
+    for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) {
+      if (a_data_ptr[i] != b_data_ptr[i]) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+};
+
+bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
+                         GenericTensorAccessorR const &accessor_b) {
+  if (accessor_a.shape != accessor_b.shape) {
+    throw mk_runtime_error(
+        fmt::format("accessors_are_equal expected accessors to have the same "
+                    "shape, but received: {} != {}",
+                    accessor_a.shape,
+                    accessor_b.shape));
   }
+  return DataTypeDispatch1<AccessorsAreEqual>{}(
+      accessor_a.data_type, accessor_a, accessor_b);
 }
 
-TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
-  return TensorShape{
-      TensorDims{
-          dims,
-      },
-      DataType::FLOAT,
-  };
+template <DataType DT>
+struct CreateFilledAccessorW {
+  GenericTensorAccessorW operator()(TensorShape const &shape,
+                                    Allocator &allocator,
+                                    DataTypeValue val) {
+    using T = real_type_t<DT>
;
+    if (!val.template has<T>()) {
+      throw mk_runtime_error("create_filled_accessor expected data type of "
+                             "shape and passed-in value to match");
+    }
+
+    auto unwrapped_value = val.get<T>();
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
+
+    T *data_ptr = src_accessor.get<DT>
();
+    for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) {
+      data_ptr[i] = unwrapped_value;
+    }
+
+    copy_accessor_data_to_l_from_r(dst_accessor, src_accessor);
+    return dst_accessor;
+  }
+};
+
+GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val) {
+
+  return DataTypeDispatch1<CreateFilledAccessorW>{}(
+      shape.data_type, shape, allocator, val);
 }
 
-TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
-  return TensorShape{
-      TensorDims{
-          dims,
-      },
-      DataType::DOUBLE,
-  };
+GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val) {
+  GenericTensorAccessorW w_accessor =
+      create_filled_accessor_w(shape, allocator, val);
+  return read_only_accessor_from_write_accessor(w_accessor);
 }
+
+} // namespace FlexFlow
diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h
index 21d4923881..8a063fea17 100644
--- a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_KERNELS_TEST_UTILS
 
 #include "kernels/device.h"
+#include "kernels/local_cpu_allocator.h"
 #include "kernels/local_cuda_allocator.h"
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
@@ -13,35 +14,96 @@
 
 using namespace FlexFlow;
 
+template <typename DT>
 GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
                                                        Allocator &allocator,
-                                                       bool cpu_fill = false);
+                                                       bool cpu_fill = false) {
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  size_t volume = accessor.shape.num_elements();
+  std::vector<DT>
host_data(volume);
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<DT>
dist(-1.0f, 1.0f);
+  for (auto &val : host_data) {
+    val = dist(gen);
+  }
+
+  if (cpu_fill) {
+    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
+  } else {
+    checkCUDA(cudaMemcpy(accessor.ptr,
+                         host_data.data(),
+                         host_data.size() * sizeof(DT),
+                         cudaMemcpyHostToDevice));
+  }
+
+  return accessor;
+}
+
+template <typename DT>
 GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
                                                 Allocator &allocator,
-                                                float val,
-                                                bool cpu_fill = false);
+                                                DT val,
+                                                bool cpu_fill = false) {
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  size_t volume = accessor.shape.num_elements();
+  std::vector<DT>
host_data(volume, val);
+
+  if (cpu_fill) {
+    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
+  } else {
+    checkCUDA(cudaMemcpy(accessor.ptr,
+                         host_data.data(),
+                         host_data.size() * sizeof(DT),
+                         cudaMemcpyHostToDevice));
+  }
+  return accessor;
+}
+
+template <typename DT>
 GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape,
                                                      Allocator &allocator,
-                                                     bool cpu_fill = false);
+                                                     bool cpu_fill = false) {
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+  size_t volume = accessor.shape.num_elements();
+  std::vector<DT>
host_data(volume);
+
+  for (size_t i = 0; i < volume; i++) {
+    host_data[i] = i;
+  }
 
-void fill_tensor_accessor_w(GenericTensorAccessorW accessor,
-                            float val,
-                            bool cpu_fill = false);
+  if (cpu_fill) {
+    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
+  } else {
+    checkCUDA(cudaMemcpy(accessor.ptr,
+                         host_data.data(),
+                         host_data.size() * sizeof(DT),
+                         cudaMemcpyHostToDevice));
+  }
 
-TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered<size_t> dims);
+  return accessor;
+}
 
-TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered<size_t> dims);
+template <DataType DT>
+TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
+  return TensorShape{
+      TensorDims{
+          dims,
+      },
+      DT,
+  };
+}
 
-template <typename T>
-std::vector<T>
load_data_to_host_from_device(GenericTensorAccessorR accessor) {
+template <typename DT>
+std::vector<DT> load_data_to_host_from_device(GenericTensorAccessorR accessor) {
   int volume = accessor.shape.get_volume();
-  std::vector<T>
local_data(volume); checkCUDA(cudaMemcpy(local_data.data(), accessor.ptr, - local_data.size() * sizeof(T), + local_data.size() * sizeof(DT), cudaMemcpyDeviceToHost)); return local_data; } From 47ad0d83e894ff78304a81d3f0464b779c8e1420 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Fri, 12 Jul 2024 12:54:48 -0700 Subject: [PATCH 02/42] test utils modification, cast, reverse, and replicate cpu kernels --- .../include/kernels/cast_kernels_cpu.h | 27 ++++ .../include/kernels/replicate_kernels_cpu.h | 24 +++ .../include/kernels/reverse_kernels_cpu.h | 29 ++++ lib/kernels/src/cpu/cast_kernels.cc | 59 ++++++++ lib/kernels/src/cpu/combine_kernels.cc | 0 lib/kernels/src/cpu/replicate_kernels.cc | 61 ++++++++ lib/kernels/src/cpu/reverse_kernels.cc | 49 +++++++ lib/kernels/src/cuda/ops/reverse_kernels.cu | 36 ++++- lib/kernels/test/src/test_cast_kernel.cc | 56 +++++++ lib/kernels/test/src/test_replicate_kernel.cc | 86 +++++++++++ lib/kernels/test/src/test_reverse_kernels.cc | 105 +++++++++++++ lib/kernels/test/src/test_utils.h | 138 +++++++++++------- 12 files changed, 610 insertions(+), 60 deletions(-) create mode 100644 lib/kernels/include/kernels/cast_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/replicate_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_cpu.h create mode 100644 lib/kernels/src/cpu/cast_kernels.cc create mode 100644 lib/kernels/src/cpu/combine_kernels.cc create mode 100644 lib/kernels/src/cpu/replicate_kernels.cc create mode 100644 lib/kernels/src/cpu/reverse_kernels.cc diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..df4ef22b93 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Cast { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); + +void backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); + +} // namespace CPU +} // namespace Cast +} // namespace Kernels +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..4bc97f00ef --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Replicate { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas); + +} // namespace CPU +} // namespace Replicate +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..89ed6ffdb4 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 
+1,29 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "device.h" + +namespace FlexFlow { +namespace Kernels { +namespace Reverse { +namespace CPU { + +void forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size); + +void backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size); +} // namespace CPU +} // namespace Reverse +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc new file mode 100644 index 0000000000..cf73a84b93 --- /dev/null +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -0,0 +1,59 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { +namespace Kernels { +namespace Cast { +namespace CPU { + +template +void cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume(); + cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct BackwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume(); + cast_backward( + input.get(), output.get(), volume, cast_to(1.0f)); + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); +} + +void backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); +} + +} // namespace CPU +} // namespace Cast +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc new file mode 100644 index 0000000000..5f63d29691 --- /dev/null +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -0,0 +1,61 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow { +namespace Kernels { +namespace Replicate { +namespace CPU { + +template +void replicate_backward_kernel(T *input, + T const *output, + size_t num_elements, + size_t num_replicas) { + for (size_t i = 0; i < num_elements; ++i) { + T sum = 0; + for (size_t j = 0; j < num_replicas; ++j) { + sum += output[j * num_elements + i]; + } + input[i] = sum; + } +} + +// Why does replicate forward seem to only transfer memory? Shouldn't it also +// handle the replication? 
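+// (Replication across devices appears to be handled by the runtime's
+// parallel dims, so the per-device forward pass is just a copy -- an
+// assumption, not confirmed by this patch.)
+//
+// Illustrative example for the backward reduction below: with
+// num_elements = 2 and num_replicas = 3, output is laid out replica-major as
+// [r0e0, r0e1, r1e0, r1e1, r2e0, r2e1], so
+// input[0] = r0e0 + r1e0 + r2e0 and input[1] = r0e1 + r1e1 + r2e1.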
+template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get(), + input.get(), + input.shape.num_elements() * size_of_datatype(T)); + } +}; + +template +struct BackwardKernel { + void operator()(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + size_t total_elements = input.shape.num_elements() * num_replicas; + replicate_backward_kernel( + input.get(), output.get(), total_elements, num_replicas); + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + DataTypeDispatch1{}( + input.data_type, input, output, num_replicas); +} + +} // namespace CPU +} // namespace Replicate +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc new file mode 100644 index 0000000000..ac8ae26ca2 --- /dev/null +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -0,0 +1,49 @@ +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow { +namespace Kernels { +namespace Reverse { +namespace CPU { + +void reverse_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { + coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size; + for (coord_t i = 0; i < total_elements; ++i) { + coord_t blk_idx = i / (reverse_dim_size * in_blk_size); + coord_t offset = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = offset / in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + + (offset % in_blk_size); + out_ptr[i] = in_ptr[in_idx]; + } +} + +void forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { + reverse_forward_kernel( + in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); +} + +void backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size) { + reverse_forward_kernel( + out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); +} + +} // namespace CPU +} // namespace Reverse +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..f73c57dedf 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -21,6 +21,29 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { +// __global__ void reverse_forward_kernel(float const *in_ptr, +// float *out_ptr, +// coord_t num_out_blks, +// coord_t reverse_dim_size, +// coord_t in_blk_size) { +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = 
in_ptr[in_idx]; +// } +// } + +/* I mentioned this earlier, but I still think the reverse_forward_kernel code + is incorrect, even though it matches the code in inference/master? Whenever + I'm testing the code and printing out the output, I'm getting unexpected + outputs, and I think it's a result of modifying the loop index i in the + previous code? +*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, @@ -28,12 +51,13 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - i = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = i / in_blk_size; - i = i - reverse_dim_idx * in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = idx_within_blk / in_blk_size; + coord_t in_idx = idx_within_blk % in_blk_size; + coord_t input_index = + blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; + out_ptr[i] = in_ptr[input_index]; } } diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 1be5839a9c..b275f7ba83 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -75,4 +75,60 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100, 100}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100, 100}); + + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + auto transform = [start_val = 1.1f, + counter = 0.0f](float input) mutable -> float { + return start_val + counter++; + }; + + // Run GPU Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), + read_only_accessor_from_write_accessor(input_accessor_gpu), + output_accessor_gpu, + DataType::FLOAT, + DataType::INT32); + std::vector result_data_gpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), + true); + + // Run CPU Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + Kernels::Cast::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu, + DataType::FLOAT, + DataType::INT32); + std::vector result_data_cpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc 
b/lib/kernels/test/src/test_replicate_kernel.cc index 27223cc7b5..efe17db3f6 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -110,4 +110,90 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } + + TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { + std::size_t num_replicas = 10; + + // This should be like three shapes: pre_replication, replication shape, and + // reduced shape, but things are weird cause doesn't seem to be replicating + // anything + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape replicated_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape reduced_shape = + make_tensor_shape_from_legion_dims({10}); + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(input_shape, gpu_allocator)); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Replicate Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + copy_tensor_between_memories( + input_accessor_gpu, input_shape, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + GenericTensorAccessorR output_grad_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + input_grad_accessor_gpu, + output_grad_accessor_gpu, + num_replicas); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + output_grad_accessor_gpu, replicated_shape, cpu_allocator); + + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::CPU::backward_kernel( + input_grad_accessor_cpu, + read_only_accessor_from_write_accessor(output_grad_accessor_cpu), + num_replicas); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 4adf79847a..e46f6b6dcb 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -136,4 
+136,109 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 2; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 5; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::CPU::forward_kernel( + input_accessor_cpu.get_float_ptr(), + output_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_cpu.shape.num_elements()); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + // Run GPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_gpu = + create_random_filled_accessor_w(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + // Run CPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + read_only_accessor_from_write_accessor(output_grad_accessor_gpu), + output_shape, + cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::CPU::backward_kernel( + output_grad_accessor_cpu.get_float_ptr(), + input_grad_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_cpu.shape.num_elements()); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 8a063fea17..5638b837b1 100644 --- 
a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -14,78 +14,99 @@ using namespace FlexFlow;
 
-template <typename DT>
-GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
-                                                       Allocator &allocator,
-                                                       bool cpu_fill = false) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<DT>
host_data(volume);
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_real_distribution<DT>
dist(-1.0f, 1.0f);
-
-  for (auto &val : host_data) {
-    val = dist(gen);
-  }
+enum class GpuDirection {
+  HostToDevice = 0,
+  DeviceToHost = 1,
+  DeviceToDevice = 2
+};
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
+template <typename DT>
+void transfer_memory(DT *dst,
+                     const DT *src,
+                     size_t num_elements,
+                     GpuDirection gpu_dir,
+                     bool cpu_memory) {
+  size_t bytes = num_elements * sizeof(DT);
+
+  if (cpu_memory) {
+    memcpy(dst, src, bytes);
   } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(DT),
-                         cudaMemcpyHostToDevice));
+    switch (gpu_dir) {
+      case GpuDirection::HostToDevice:
+        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
+        break;
+      case GpuDirection::DeviceToHost:
+        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
+        break;
+      case GpuDirection::DeviceToDevice:
+        checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
+        break;
+    }
   }
-
-  return accessor;
 }
 
+GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
+                                                       Allocator &allocator,
+                                                       bool on_host = false);
+
 template <typename DT>
 GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
                                                 Allocator &allocator,
                                                 DT val,
-                                                bool cpu_fill = false) {
+                                                bool on_host = false) {
   GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
   size_t volume = accessor.shape.num_elements();
   std::vector<DT>
host_data(volume, val);
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(DT),
-                         cudaMemcpyHostToDevice));
-  }
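+  // A sketch of the intended call: transfer_memory centralizes the
+  // memcpy/cudaMemcpy choice, with on_host selecting the plain-memcpy path.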
+  transfer_memory(static_cast<DT *>(accessor.ptr),
+                  host_data.data(),
+                  volume,
+                  GpuDirection::HostToDevice,
+                  on_host);
 
   return accessor;
 }
 
-template <typename DT>
-GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape,
+template <typename DT, typename F>
+GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape,
                                                      Allocator &allocator,
-                                                     bool cpu_fill = false) {
+                                                     F transform,
+                                                     bool on_host = false) {
   GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  size_t volume = accessor.shape.num_elements();
-  std::vector<DT>
host_data(volume);
+  size_t volume = accessor.shape.get_volume();
+  std::vector<DT> input_data(volume);
+  std::vector<DT> output_data(volume);
 
-  for (size_t i = 0; i < volume; i++) {
-    host_data[i] = i;
-  }
+  std::transform(
+      input_data.begin(), input_data.end(), output_data.begin(), transform);
 
-  if (cpu_fill) {
-    memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT));
-  } else {
-    checkCUDA(cudaMemcpy(accessor.ptr,
-                         host_data.data(),
-                         host_data.size() * sizeof(DT),
-                         cudaMemcpyHostToDevice));
-  }
+  transfer_memory(static_cast<DT *>(accessor.ptr),
+                  output_data.data(),
+                  volume,
+                  GpuDirection::HostToDevice,
+                  on_host);
 
   return accessor;
 }
 
+template <typename DT>
+GenericTensorAccessorW
+    copy_tensor_between_memories(GenericTensorAccessorR accessor,
+                                 TensorShape const &shape,
+                                 Allocator &allocator,
+                                 bool src_on_host = false) {
+  GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape);
+
+  size_t volume = accessor.shape.get_volume();
+  GpuDirection gpu_dir =
+      src_on_host ? GpuDirection::HostToDevice : GpuDirection::DeviceToHost;
+
+  transfer_memory(
+      copied_accessor.get<DT>
(), accessor.get<DT>
(), volume, gpu_dir, false);
+
+  return copied_accessor;
+}
+
 template <DataType DT>
 TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
   return TensorShape{
@@ -96,15 +117,24 @@ TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) {
   };
 }
 
-template <typename DT>
-std::vector<DT>
load_data_to_host_from_device(GenericTensorAccessorR accessor) {
+template <DataType DT>
+std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorR accessor,
+                                              bool on_device = true) {
   int volume = accessor.shape.get_volume();
-  std::vector<DT>
local_data(volume);
-  checkCUDA(cudaMemcpy(local_data.data(),
-                       accessor.ptr,
-                       local_data.size() * sizeof(DT),
-                       cudaMemcpyDeviceToHost));
+  using T = real_type<DT>;
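+  // Copies accessor contents into a host-side vector; on_device = false
+  // assumes the pointer already refers to host memory (illustrative note).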
+  std::vector<T> local_data(volume);
+  T const *src_ptr = accessor.get<DT>
(); + + if (on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + std::memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } + return local_data; } From 921fe6568cb6a415f22fa878c5759bc10eefbe57 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 14 Jul 2024 15:45:59 -0700 Subject: [PATCH 03/42] combine kernel --- lib/kernels/include/kernels/cast_kernels.h | 2 - lib/kernels/src/cpu/combine_kernels.cc | 44 +++++++++++++++++++ lib/kernels/src/cpu/replicate_kernels.cc | 2 +- lib/kernels/src/cpu/reverse_kernels.cc | 1 - lib/kernels/src/local_cpu_allocator.cc | 4 +- lib/kernels/src/local_cuda_allocator.cc | 1 + lib/kernels/test/src/test_cast_kernel.cc | 4 +- lib/kernels/test/src/test_replicate_kernel.cc | 13 +++--- lib/kernels/test/src/test_reverse_kernels.cc | 8 ++-- lib/kernels/test/src/test_utils.h | 12 ++--- 10 files changed, 65 insertions(+), 26 deletions(-) diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..502a823ca7 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -3,8 +3,6 @@ #include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index e69de29bb2..f1950a56d2 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -0,0 +1,44 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { +namespace Kernels { +namespace Combine { +namespace CPU { + +template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
<DT>(),
+           input.get<DT>(),
+           input.shape.get_volume() * size_of_datatype(DT));
+  }
+};
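+
+// Note (assumption): the backward pass accumulates with += so that any
+// gradient already present in input_grad is added to, not overwritten.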
+
+template <DataType DT>
+struct BackwardKernel {
+  void operator()(GenericTensorAccessorR const &output_grad,
+                  GenericTensorAccessorW const &input_grad) {
+    size_t num_elements = output_grad.shape.get_volume();
+    for (int i = 0; i < num_elements; ++i) {
+      input_grad.get<DT>
()[i] += output_grad.get<DT>
()[i]; + } + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace CPU +} // namespace Combine +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 5f63d29691..a26d2054d1 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -14,7 +14,7 @@ void replicate_backward_kernel(T *input, for (size_t i = 0; i < num_elements; ++i) { T sum = 0; for (size_t j = 0; j < num_replicas; ++j) { - sum += output[j * num_elements + i]; + sum += output[i + j * num_elements]; } input[i] = sum; } diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index ac8ae26ca2..b035f03721 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,5 +1,4 @@ #include "kernels/reverse_kernels_cpu.h" -#include namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 6553dc2f88..9cc86c44ca 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -3,12 +3,14 @@ namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { - void *ptr = malloc(requested_memory_size); + void *ptr = calloc(1, requested_memory_size); + if (ptr != nullptr) { this->ptrs.insert(ptr); } else { throw std::bad_alloc(); } + return ptr; } diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index cdcfb017a0..dad101c64c 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -6,6 +6,7 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); this->ptrs.insert(ptr); return ptr; } diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b275f7ba83..b427b493b8 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -112,7 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_gpu), - true); + false); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index efe17db3f6..65f02f4bc9 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -116,7 +116,7 @@ TEST_SUITE(FF_TEST_SUITE) { // This should be like three shapes: pre_replication, replication shape, and // reduced shape, but things are weird cause doesn't seem to be replicating - // anything + // anything (ie. 
input shape should be same as reduced shape) TensorShape input_shape = make_tensor_shape_from_legion_dims({10, num_replicas}); TensorShape replicated_shape = @@ -142,7 +142,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -156,12 +156,13 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor_cpu); std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = read_only_accessor_from_write_accessor( create_random_filled_accessor_w(replicated_shape, gpu_allocator)); @@ -175,12 +176,12 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); + // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( output_grad_accessor_gpu, replicated_shape, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(reduced_shape); @@ -191,7 +192,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index e46f6b6dcb..f37bbba941 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -174,7 +174,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_gpu.shape.num_elements()); std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -192,7 +192,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_cpu.shape.num_elements()); std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } @@ -215,7 +215,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = @@ -236,7 +236,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 5638b837b1..80720801b6 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -119,21 +119,15 @@ TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims) { template std::vector> load_accessor_data(GenericTensorAccessorR accessor, - bool 
on_device = true) { + bool on_host = false) { int volume = accessor.shape.get_volume(); using T = real_type
<DT>;
+  std::vector<T> local_data(volume);
+  T const *src_ptr = accessor.get<DT>
(); - if (on_device) { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); - } else { - std::memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } + transfer_memory( + local_data.data(), src_ptr, volume, GpuDirection::DeviceToHost, on_host); return local_data; } From 4ca67aa7549d00a0aff3e745ff242e1bea47d3e4 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 14 Jul 2024 15:58:40 -0700 Subject: [PATCH 04/42] combine kernels .h file --- .../include/kernels/combine_kernels_cpu.h | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 lib/kernels/include/kernels/combine_kernels_cpu.h diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..1d30297af1 --- /dev/null +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Combine { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace CPU +} // namespace Combine +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H From 86edf2e4aa0d2a34d7135988efca1ad3dff10826 Mon Sep 17 00:00:00 2001 From: Marsella8 <45826022+Marsella8@users.noreply.github.com> Date: Thu, 18 Jul 2024 17:38:22 -0700 Subject: [PATCH 05/42] Implementations for methods for machine_views and associated modules (#1429) * initial commit for machine view adjacent modules * Formatting * Tests for new machine_view.cc functions * formatting * Minor Test correction * formatting * PR fixes * PR Fixes --------- Co-authored-by: Pietro Max Marsella --- lib/pcg/include/pcg/strided_rectangle.h | 17 +++++ lib/pcg/src/pcg/strided_rectangle_side.cc | 17 +++++ lib/pcg/src/strided_rectangle.cc | 35 ++++++++++ lib/pcg/test/src/test_machine_view.cc | 74 ++++++++++++++++++++++ lib/pcg/test/src/test_strided_rectangle.cc | 37 +++++++++++ 5 files changed, 180 insertions(+) create mode 100644 lib/pcg/include/pcg/strided_rectangle.h create mode 100644 lib/pcg/src/pcg/strided_rectangle_side.cc create mode 100644 lib/pcg/src/strided_rectangle.cc create mode 100644 lib/pcg/test/src/test_machine_view.cc create mode 100644 lib/pcg/test/src/test_strided_rectangle.cc diff --git a/lib/pcg/include/pcg/strided_rectangle.h b/lib/pcg/include/pcg/strided_rectangle.h new file mode 100644 index 0000000000..9c3b8eeda9 --- /dev/null +++ b/lib/pcg/include/pcg/strided_rectangle.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_PCG_INCLUDE_PCG_STRIDED_RECTANGLE_H +#define _FLEXFLOW_PCG_INCLUDE_PCG_STRIDED_RECTANGLE_H + +#include "op-attrs/ff_dim.dtg.h" +#include "pcg/side_size_t.dtg.h" +#include "pcg/strided_rectangle.dtg.h" + +namespace FlexFlow { + +size_t get_num_dims(StridedRectangle const &); +StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, + ff_dim_t const &idx); +num_points_t get_num_points(StridedRectangle const &rect); + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/strided_rectangle_side.cc b/lib/pcg/src/pcg/strided_rectangle_side.cc new file mode 100644 index 0000000000..e6caf4cb86 --- /dev/null +++ b/lib/pcg/src/pcg/strided_rectangle_side.cc @@ -0,0 
+1,17 @@ +#include "pcg/strided_rectangle_side.h" +#include "utils/exception.h" + +namespace FlexFlow { + +StridedRectangleSide strided_side_from_size_and_stride(side_size_t side_size, + int stride) { + assert((side_size.unwrapped % stride) == 0); + return StridedRectangleSide{num_points_t{side_size.unwrapped / stride}, + stride}; +} + +side_size_t get_side_size(StridedRectangleSide const &s) { + return side_size_t{s.num_points.unwrapped * s.stride}; +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/strided_rectangle.cc b/lib/pcg/src/strided_rectangle.cc new file mode 100644 index 0000000000..1c61424ab9 --- /dev/null +++ b/lib/pcg/src/strided_rectangle.cc @@ -0,0 +1,35 @@ +#include "pcg/strided_rectangle.h" +#include "op-attrs/dim_ordered/transform.h" +#include "utils/containers.h" + +namespace FlexFlow { + +/* size_t StridedRectangle::at(FFOrdered const &coord) const { */ +/* assert(coord.size() == this->num_dims()); */ + +/* size_t _1d_stride = 1; */ +/* size_t idx = 0; */ +/* for (auto dim : inner_to_outer_idxs(this->sides)) { */ +/* idx += this->sides.at(dim).at(coord.at(dim)).value() * _1d_stride; */ +/* _1d_stride *= this->sides.at(dim).get_size().value(); */ +/* } */ +/* return idx; */ +/* } */ + +size_t get_num_dims(StridedRectangle const &rect) { + return rect.sides.size(); +} + +num_points_t get_num_points(StridedRectangle const &rect) { + return num_points_t{ + product(transform(rect.sides, [](StridedRectangleSide const &side) { + return side.num_points.unwrapped; + }))}; +} + +StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, + ff_dim_t const &idx) { + return rect.sides.at(idx); +} + +} // namespace FlexFlow diff --git a/lib/pcg/test/src/test_machine_view.cc b/lib/pcg/test/src/test_machine_view.cc new file mode 100644 index 0000000000..92a96d5e9a --- /dev/null +++ b/lib/pcg/test/src/test_machine_view.cc @@ -0,0 +1,74 @@ +#include "doctest/doctest.h" +#include "pcg/machine_view.h" +#include "pcg/strided_rectangle.h" +#include "pcg/strided_rectangle_side.h" + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("MachineView general util functions") { + StridedRectangle rect{{StridedRectangleSide{num_points_t{7}, 5}, + StridedRectangleSide{num_points_t{10}, 2}}}; + gpu_id_t start(1); + MachineView mv{device_id_t{start}, rect}; + SUBCASE("num_dims") { + CHECK(num_dims(mv) == 2); + } + SUBCASE("num_devices") { + CHECK(num_devices(mv) == 7 * 10); + } + SUBCASE("get_device_type") { + CHECK(get_device_type(mv) == DeviceType::GPU); + } + } + + TEST_CASE("MachineView make_1d_machine_view - GPU") { + StridedRectangle rect{{StridedRectangleSide{num_points_t{7}, 5}}}; + device_id_t start_gpu{gpu_id_t{1}}; + MachineView gpu_mv{start_gpu, rect}; + + SUBCASE("make_1d_machine_view(gpu_id_t start, gpu_id_t stop, int stride)") { + MachineView result = + make_1d_machine_view(start_gpu, device_id_t{gpu_id_t(1 + 7 * 5)}, 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(gpu_id_t start, num_points_t num_points, int " + "stride)") { + MachineView result = make_1d_machine_view(start_gpu, num_points_t{7}, 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(gpu_id_t start, side_size_t interval_size, " + "int stride)") { + MachineView result = make_1d_machine_view( + start_gpu, get_side_size(rect.sides.at(ff_dim_t{0})), 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + } + + TEST_CASE("MachineView make_1d_machine_view - CPU") { + StridedRectangle 
rect{{StridedRectangleSide{num_points_t{11}, 4}}}; + device_id_t start_cpu{cpu_id_t{2}}; + MachineView cpu_mv{start_cpu, rect}; + + SUBCASE("make_1d_machine_view(cpu_id_t start, cpu_id_t stop, int stride)") { + MachineView result = + make_1d_machine_view(start_cpu, device_id_t{cpu_id_t(2 + 11 * 4)}, 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(cpu_id_t start, num_points_t num_points, int " + "stride)") { + MachineView result = make_1d_machine_view(start_cpu, num_points_t{11}, 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(cpu_id_t start, side_size_t interval_size, " + "int stride)") { + MachineView result = make_1d_machine_view( + start_cpu, get_side_size(rect.sides.at(ff_dim_t{0})), 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + } +} diff --git a/lib/pcg/test/src/test_strided_rectangle.cc b/lib/pcg/test/src/test_strided_rectangle.cc new file mode 100644 index 0000000000..ef342944de --- /dev/null +++ b/lib/pcg/test/src/test_strided_rectangle.cc @@ -0,0 +1,37 @@ +#include "doctest/doctest.h" +#include "pcg/strided_rectangle.h" +#include "pcg/strided_rectangle_side.h" + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_side_size(StridedRectangleSide)") { + StridedRectangleSide side{num_points_t{7}, 5}; + + CHECK(get_side_size(side) == side_size_t{7 * 5}); + } + TEST_CASE("strided_side_from_size_and_stride") { + StridedRectangleSide correct{num_points_t{10}, 3}; + StridedRectangleSide result = + strided_side_from_size_and_stride(side_size_t{10 * 3}, 3); + CHECK(result == correct); + } + + TEST_CASE("StridedRectangle - helper functions") { + + StridedRectangleSide s0{num_points_t{7}, 5}; + StridedRectangleSide s1{num_points_t{10}, 2}; + StridedRectangleSide s2{num_points_t{8}, 1}; + StridedRectangle rect{{s0, s1, s2}}; + + SUBCASE("get_num_dims") { + CHECK(get_num_dims(rect) == 3); + } + SUBCASE("get_num_points") { + CHECK(get_num_points(rect) == num_points_t{7 * 8 * 10}); + } + SUBCASE("get_side_at_idx") { + CHECK(get_side_at_idx(rect, ff_dim_t{0}) == s0); + CHECK(get_side_at_idx(rect, ff_dim_t{1}) == s1); + CHECK(get_side_at_idx(rect, ff_dim_t{2}) == s2); + } + } +} From d9af610c5f940b1c06455fb938d7e589abaf712b Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Wed, 31 Jul 2024 04:49:13 -0700 Subject: [PATCH 06/42] test utils logic cleanup, reverse cpu_kernel pedagogical implmentation, other minor fixes --- lib/kernels/include/kernels/accessor.h | 26 ++- lib/kernels/include/kernels/allocation.h | 9 + .../include/kernels/cast_kernels_cpu.h | 18 +- .../include/kernels/combine_kernels_cpu.h | 10 +- .../include/kernels/local_cpu_allocator.h | 1 + .../include/kernels/local_cuda_allocator.h | 1 + .../include/kernels/replicate_kernels_cpu.h | 12 +- .../include/kernels/reverse_kernels_cpu.h | 26 ++- lib/kernels/src/accessor.cc | 6 +- lib/kernels/src/allocation.cc | 14 +- lib/kernels/src/cpu/cast_kernels.cc | 35 ++-- lib/kernels/src/cpu/combine_kernels.cc | 18 +- lib/kernels/src/cpu/replicate_kernels.cc | 30 ++-- lib/kernels/src/cpu/reverse_kernels.cc | 88 ++++++---- lib/kernels/src/local_cpu_allocator.cc | 18 +- lib/kernels/src/local_cuda_allocator.cc | 11 +- lib/kernels/test/src/test_attention_kernel.cc | 26 ++- .../test/src/test_batch_matmul_kernel.cc | 12 +- .../test/src/test_batch_norm_kernel.cc | 15 +- lib/kernels/test/src/test_cast_kernel.cc | 25 +-- lib/kernels/test/src/test_dropout.cc | 6 +- lib/kernels/test/src/test_gather_kernels.cc | 3 +- 
.../test/src/test_layer_norm_kernels.cc | 3 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_replicate_kernel.cc | 46 +++-- lib/kernels/test/src/test_reverse_kernels.cc | 38 ++--- lib/kernels/test/src/test_softmax_kernel.cc | 6 +- lib/kernels/test/src/test_split_kernel.cc | 6 +- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- lib/kernels/test/src/test_utils.h | 160 +++++++++++------- .../local-execution/tracked_allocator.h | 1 + lib/local-execution/src/tracked_allocator.cc | 10 +- 32 files changed, 417 insertions(+), 272 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..e30e1fe825 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -28,15 +28,20 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; + GenericTensorAccessorW(DataType dt, + ArrayShape sh, + req p, + bool on_dev = true) + : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + public: DataType data_type; ArrayShape shape; req ptr; + bool on_device; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + GenericTensorAccessorW, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorW const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); @@ -59,15 +64,20 @@ class GenericTensorAccessorR { double const *get_double_ptr() const; half const *get_half_ptr() const; + GenericTensorAccessorR(DataType dt, + ArrayShape sh, + req p, + bool on_dev = true) + : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + public: DataType data_type; ArrayShape shape; req ptr; + bool on_device; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + GenericTensorAccessorR, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorR const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..452ccc47b0 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,10 +5,13 @@ #include #include +enum class AllocLocation { HOST, DEVICE }; + namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; + virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; virtual ~IAllocator() = default; @@ -18,7 +21,11 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + GenericTensorAccessorW + allocate_tensor_and_zero(TensorShape const &tensor_shape); + void *allocate(size_t mem_size); + void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); template @@ -30,6 +37,8 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; + AllocLocation alloc_location; + private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index df4ef22b93..cae0c9da8d 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -7,19 +7,17 @@ namespace FlexFlow { namespace Kernels { namespace Cast { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, 
- GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); -void backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); +void cpu_backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); -} // namespace CPU } // namespace Cast } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h index 1d30297af1..66c22ddbf8 100644 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -7,15 +7,13 @@ namespace FlexFlow { namespace Kernels { namespace Combine { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); -void backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace CPU } // namespace Combine } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h index 27dcc9d854..121ed184e9 100644 --- a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -10,6 +10,7 @@ struct LocalCPUAllocator : public IAllocator { ~LocalCPUAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; private: diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..16f60daead 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,6 +10,7 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; private: diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h index 4bc97f00ef..11d2f1bf5c 100644 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -7,16 +7,14 @@ namespace FlexFlow { namespace Kernels { namespace Replicate { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); -void backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, - size_t num_replicas); +void cpu_backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas); -} // namespace CPU } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index 89ed6ffdb4..bb17aa9400 100644 --- 
a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -6,22 +6,20 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { -namespace CPU { -void forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); +void cpu_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size); -void backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); -} // namespace CPU +void cpu_backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size); } // namespace Reverse } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..66d3c02300 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -156,8 +156,10 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.on_device}; } bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ccd88580db..ce06fbabe0 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -7,6 +7,10 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } +void *Allocator::allocate_and_zero(size_t mem_size) { + return this->i_allocator->allocate_and_zero(mem_size); +} + void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } @@ -14,7 +18,15 @@ void Allocator::deallocate(void *ptr) { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + bool on_device = this->alloc_location == AllocLocation::DEVICE; + return {tensor_shape.data_type, tensor_shape, ptr, on_device}; +} + +GenericTensorAccessorW + Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) { + void *ptr = this->allocate_and_zero(get_size_in_bytes(tensor_shape)); + bool on_device = this->alloc_location == AllocLocation::DEVICE; + return {tensor_shape.data_type, tensor_shape, ptr, on_device}; } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index cf73a84b93..5888d9a96a 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -4,56 +4,55 @@ namespace FlexFlow { namespace Kernels { namespace Cast { -namespace CPU { template -void cast_forward(IDT const *input, ODT *output, size_t volume) { +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { for (size_t i = 0; i < volume; ++i) { output[i] = static_cast(input[i]); } } template -void cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { for (size_t i = 0; i < volume; i++) { output[i] = static_cast(input[i]) + beta * output[i]; } } template 
<DataType IDT, DataType ODT>
-struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { size_t volume = input.shape.get_volume(); - cast_forward(input.get(), output.get(), volume); + cpu_cast_forward(input.get(), output.get(), volume); } }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { size_t volume = input.shape.get_volume(); - cast_backward( + cpu_cast_backward( input.get(), output.get(), volume, cast_to(1.0f)); } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); } -void backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); +void cpu_backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}( + input_type, output_type, input, output); } -} // namespace CPU } // namespace Cast } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index f1950a56d2..e48f4c3e01 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -4,10 +4,9 @@ namespace FlexFlow { namespace Kernels { namespace Combine { -namespace CPU { template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { memcpy(output.get
<DT>
(), @@ -17,7 +16,7 @@ struct ForwardKernel { }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { size_t num_elements = output_grad.shape.get_volume(); @@ -27,18 +26,17 @@ struct BackwardKernel { } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); } -void backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}( +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( input_grad.data_type, output_grad, input_grad); } -} // namespace CPU } // namespace Combine } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index a26d2054d1..239baf4041 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -4,13 +4,12 @@ namespace FlexFlow { namespace Kernels { namespace Replicate { -namespace CPU { template -void replicate_backward_kernel(T *input, - T const *output, - size_t num_elements, - size_t num_replicas) { +void cpu_replicate_backward_kernel(T *input, + T const *output, + size_t num_elements, + size_t num_replicas) { for (size_t i = 0; i < num_elements; ++i) { T sum = 0; for (size_t j = 0; j < num_replicas; ++j) { @@ -23,7 +22,7 @@ void replicate_backward_kernel(T *input, // Why does replicate forward seem to only transfer memory? Shouldn't it also // handle the replication? 
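// [editor's note, not part of the original patch] The likely answer to the
// question above: in this codebase the replication degree is carried by the
// parallel tensor shape, so on a single device the forward pass of Replicate
// really is just a copy of the local shard, and fan-out across replicas is
// handled by the surrounding runtime rather than by this kernel. A CPU
// forward that did materialize every replica could look like the sketch
// below; cpu_replicate_forward_kernel is a hypothetical name, the sketch
// assumes a contiguous output of num_elements * num_replicas elements, and
// it needs <cstring> for std::memcpy:
//
//   template <typename T>
//   void cpu_replicate_forward_kernel(T const *input,
//                                     T *output,
//                                     size_t num_elements,
//                                     size_t num_replicas) {
//     for (size_t j = 0; j < num_replicas; ++j) {
//       // every replica receives an identical copy of the input shard
//       std::memcpy(output + j * num_elements, input, num_elements * sizeof(T));
//     }
//   }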
template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { memcpy(output.get(), @@ -33,29 +32,28 @@ struct ForwardKernel { }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; - replicate_backward_kernel( + cpu_replicate_backward_kernel( input.get(), output.get(), total_elements, num_replicas); } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); } -void backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, - size_t num_replicas) { - DataTypeDispatch1{}( +void cpu_backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + DataTypeDispatch1{}( input.data_type, input, output, num_replicas); } -} // namespace CPU } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index b035f03721..350dad03e9 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,48 +1,78 @@ #include "kernels/reverse_kernels_cpu.h" +#include +#include namespace FlexFlow { namespace Kernels { namespace Reverse { -namespace CPU { -void reverse_forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { +void cpu_reverse_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size; - for (coord_t i = 0; i < total_elements; ++i) { - coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t offset = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = offset / in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + - (offset % in_blk_size); - out_ptr[i] = in_ptr[in_idx]; + + std::vector> in_blocks(num_out_blks * reverse_dim_size, + std::vector(in_blk_size)); + + // For each output block, copy the input block into in_blocks + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + // Each output block has reverse_dim_size input blocks + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + + // Copy elements from in_ptr to the current block in in_blocks + std::vector ¤t_block = + in_blocks[blk_idx * reverse_dim_size + rev_idx]; + for (coord_t i = 0; i < in_blk_size; ++i) { + current_block[i] = in_ptr[start_idx + i]; + } + } + } + + // Reverse the in_blocks within each output block + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + auto block_start = in_blocks.begin() + blk_idx * reverse_dim_size; + auto block_end = block_start + reverse_dim_size; + std::reverse(block_start, block_end); + } + + // Copy the reversed blocks to the output array + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) 
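// [editor's note, not part of the original patch] Worked example of the
// reversal this final copy-out loop completes, with num_out_blks = 2,
// reverse_dim_size = 3, in_blk_size = 2: the input
//   [a0 a1 | b0 b1 | c0 c1 || d0 d1 | e0 e1 | f0 f1]
// becomes
//   [c0 c1 | b0 b1 | a0 a1 || f0 f1 | e0 e1 | d0 d1]
// Only the order of the reverse-dimension blocks flips; elements inside each
// in_blk, and the outer blocks themselves, stay in place. Equivalently, for a
// flat output index i (restating the closed form used by the removed one-pass
// kernel above, as a correctness check):
//   blk = i / (reverse_dim_size * in_blk_size)
//   rev = (i / in_blk_size) % reverse_dim_size
//   off = i % in_blk_size
//   out[i] = in[(blk * reverse_dim_size + (reverse_dim_size - 1 - rev))
//                 * in_blk_size + off]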
{ + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + + // Copy elements from the current block in in_blocks to out_ptr + std::vector const ¤t_block = + in_blocks[blk_idx * reverse_dim_size + rev_idx]; + for (coord_t i = 0; i < in_blk_size; ++i) { + out_ptr[start_idx + i] = current_block[i]; + } + } } } -void forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { - reverse_forward_kernel( +void cpu_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { + cpu_reverse_forward_kernel( in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -void backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size) { - reverse_forward_kernel( +void cpu_backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size) { + cpu_reverse_forward_kernel( out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -} // namespace CPU } // namespace Reverse } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 9cc86c44ca..ced707edcc 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -3,6 +3,18 @@ namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + + if (ptr != nullptr) { + this->ptrs.insert(ptr); + } else { + throw std::bad_alloc(); + } + + return ptr; +} + +void *LocalCPUAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr = calloc(1, requested_memory_size); if (ptr != nullptr) { @@ -25,13 +37,15 @@ void LocalCPUAllocator::deallocate(void *ptr) { } LocalCPUAllocator::~LocalCPUAllocator() { - for (auto ptr : ptrs) { + for (void *ptr : this->ptrs) { free(ptr); } } Allocator create_local_cpu_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + allocator.alloc_location = AllocLocation::HOST; + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index dad101c64c..b6c615a5ca 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -4,6 +4,13 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + this->ptrs.insert(ptr); + return ptr; +} + +void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); @@ -28,7 +35,9 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + allocator.alloc_location = AllocLocation::DEVICE; + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 023233ecb0..c4a3f7bd50 100644 --- 
a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -47,13 +47,16 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -73,15 +76,20 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, + allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 8a11a069f5..bb9c4c07bd 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -29,11 +29,14 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, allocator); + create_random_filled_accessor_w(input_shape_a, + allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, allocator); + create_random_filled_accessor_w(input_shape_b, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -52,7 +55,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 03a3a1ad40..43bcc5528a 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -36,7 +36,8 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, 
DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w( @@ -58,13 +59,17 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, allocator); + create_random_filled_accessor_w(scale_shape, + allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, allocator); + create_random_filled_accessor_w(bias_shape, + allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b427b493b8..a6990d2ad0 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -83,9 +83,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -102,31 +102,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before GPU load" << std::endl; std::vector result_data_gpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), - false); + load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); - Kernels::Cast::CPU::forward_kernel( + input_shape, cpu_allocator, transform); + Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before CPU load" << std::endl; + if (output_accessor_cpu.on_device) { + std::cout << "CPU data is on device" << std::endl; + } else { + std::cout << "CPU data is on host" << std::endl; + } std::vector result_data_cpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), - true); + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 4be2bdf7bb..7ff364bada 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -47,9 +47,11 @@ TEST_SUITE(FF_TEST_SUITE) { 
SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 7f97563217..4f05c89813 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -41,7 +41,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 7d7298f83d..3ac0e1425f 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -56,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 00fa968235..f71d9cfa11 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -45,9 +45,11 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 65f02f4bc9..e952f1107f 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -118,11 +118,11 @@ TEST_SUITE(FF_TEST_SUITE) { // reduced shape, but things are weird cause doesn't seem to be replicating // anything (ie. 
input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}); + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -133,30 +133,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { // Run GPU Replicate Forward Kernel GenericTensorAccessorR input_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, gpu_allocator)); + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(replicated_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories( - input_accessor_gpu, input_shape, cpu_allocator); + copy_tensor_between_memories(input_accessor_gpu, + cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(replicated_shape); - Kernels::Replicate::CPU::forward_kernel( + Kernels::Replicate::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -164,35 +164,33 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + create_random_filled_accessor_r(replicated_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(reduced_shape); + gpu_allocator.allocate_tensor_and_zero(reduced_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, output_grad_accessor_gpu, num_replicas); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( - output_grad_accessor_gpu, replicated_shape, cpu_allocator); + output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(reduced_shape); + cpu_allocator.allocate_tensor_and_zero(reduced_shape); - Kernels::Replicate::CPU::backward_kernel( + Kernels::Replicate::cpu_backward_kernel( input_grad_accessor_cpu, read_only_accessor_from_write_accessor(output_grad_accessor_cpu), num_replicas); - std::vector result_data_cpu = load_accessor_data( - 
read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index f37bbba941..7899afa718 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -42,7 +42,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -142,9 +143,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t reverse_dim_size = 3; std::size_t in_blk_size = 5; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -161,7 +161,7 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Cast Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -173,17 +173,17 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); + input_shape, cpu_allocator, transform); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); - Kernels::Reverse::CPU::forward_kernel( + Kernels::Reverse::cpu_forward_kernel( input_accessor_cpu.get_float_ptr(), output_accessor_cpu.get_float_ptr(), num_out_blks, @@ -191,8 +191,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -200,7 +200,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_gpu = - create_random_filled_accessor_w(output_shape, gpu_allocator); + create_random_filled_accessor_w(output_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = gpu_allocator.allocate_tensor(input_shape); @@ -213,20 +214,18 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( read_only_accessor_from_write_accessor(output_grad_accessor_gpu), - output_shape, 
cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(input_shape); - Kernels::Reverse::CPU::backward_kernel( + Kernels::Reverse::cpu_backward_kernel( output_grad_accessor_cpu.get_float_ptr(), input_grad_accessor_cpu.get_float_ptr(), num_out_blks, @@ -234,9 +233,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 5519c30b80..88f24a1a08 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -23,11 +23,13 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 34993fa151..9f1d390501 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -27,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -48,7 +49,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 0bc85cb8e0..c8baaac54f 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -39,7 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 80720801b6..d4511c9dc5 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -14,55 +14,74 @@ using namespace FlexFlow; -enum class GpuDirection { - HostToDevice = 0, - DeviceToHost = 1, - DeviceToDevice = 2 -}; - template -void transfer_memory(DT *dst, +void transfer_memory(GenericTensorAccessorW dst_accessor, const DT *src, - size_t num_elements, - GpuDirection gpu_dir, - bool cpu_memory) { - size_t bytes = num_elements * sizeof(DT); - - if 
(cpu_memory) { - memcpy(dst, src, bytes); + AllocLocation src_loc) { + size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT); + AllocLocation dst_loc = + dst_accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST; + + if (src_loc == AllocLocation::HOST && dst_loc == AllocLocation::HOST) { + memcpy(dst_accessor.ptr, src, bytes); + } else if (src_loc == AllocLocation::HOST && + dst_loc == AllocLocation::DEVICE) { + checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice)); + } else if (src_loc == AllocLocation::DEVICE && + dst_loc == AllocLocation::HOST) { + checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost)); } else { - switch (gpu_dir) { - case GpuDirection::HostToDevice: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice)); - break; - case GpuDirection::DeviceToHost: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost)); - break; - case GpuDirection::DeviceToDevice: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice)); - break; - } + checkCUDA( + cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToDevice)); } } +template GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool on_host = false); + Allocator &allocator) { + assert(shape.data_type == DataType::FLOAT || + shape.data_type == DataType::DOUBLE); + using T = real_type
<DT>
; + + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + + std::vector host_data(accessor.shape.num_elements()); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0, 1.0); + + for (auto &val : host_data) { + val = dist(gen); + } + + transfer_memory(accessor, host_data.data(), AllocLocation::HOST); + + return accessor; +} + +template +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w
<DT>
(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} template GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - DT val, - bool on_host = false) { + DT val) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + + size_t volume = accessor.shape.get_volume(); std::vector
<DT>
host_data(volume, val); - transfer_memory(static_cast
<DT *>
(accessor.ptr), - host_data.data(), - volume, - GpuDirection::HostToDevice, - on_host); + transfer_memory(accessor, host_data.data(), AllocLocation::HOST); return accessor; } @@ -70,9 +89,11 @@ GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, template GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, Allocator &allocator, - F transform, - bool on_host = false) { + F transform) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + size_t volume = accessor.shape.get_volume(); std::vector input_data(volume); std::vector output_data(volume); @@ -80,11 +101,7 @@ GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, std::transform( input_data.begin(), input_data.end(), output_data.begin(), transform); - transfer_memory(static_cast(accessor.ptr), - output_data.data(), - volume, - GpuDirection::HostToDevice, - on_host); + transfer_memory(accessor, output_data.data(), AllocLocation::HOST); return accessor; } @@ -92,42 +109,59 @@ GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, template GenericTensorAccessorW copy_tensor_between_memories(GenericTensorAccessorR accessor, - TensorShape const &shape, - Allocator &allocator, - bool src_on_host = false) { + Allocator &allocator) { + TensorShape shape = get_tensor_shape(accessor.shape, accessor.data_type); GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape); + copied_accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; - size_t volume = accessor.shape.get_volume(); - GpuDirection gpu_dir = - src_on_host ? GpuDirection::HostToDevice : GpuDirection::DeviceToHost; + AllocLocation src_loc = + accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST; - transfer_memory( - copied_accessor.get
<DT>
(), accessor.get
<DT>
(), volume, gpu_dir, false); + transfer_memory(copied_accessor, accessor.get
<DT>
(), src_loc); return copied_accessor; } -template -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DT, - }; -} +TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, + DataType DT); template -std::vector> load_accessor_data(GenericTensorAccessorR accessor, - bool on_host = false) { +std::vector> load_accessor_data(GenericTensorAccessorR accessor) { + using T = real_type
<DT>
; + int volume = accessor.shape.get_volume(); + std::vector local_data(volume); + T const *src_ptr = accessor.get
<DT>
(); + + if (accessor.on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } + + return local_data; +} +template +std::vector> load_accessor_data(GenericTensorAccessorW accessor) { using T = real_type
<DT>
; + + int volume = accessor.shape.get_volume(); std::vector local_data(volume); T const *src_ptr = accessor.get
<DT>
(); - transfer_memory( - local_data.data(), src_ptr, volume, GpuDirection::DeviceToHost, on_host); + if (accessor.on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } return local_data; } diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..d6f338fe14 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -12,6 +12,7 @@ struct TrackedAllocator : public IAllocator { ~TrackedAllocator() = default; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; size_t get_current_mem_usage(); diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..9f13f006f3 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,6 +12,12 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } +void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { + void *ptr = this->allocator.allocate_and_zero(requested_memory_size); + this->current_mem_usage += requested_memory_size; + return ptr; +} + void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); @@ -24,7 +30,9 @@ size_t TrackedAllocator::get_current_mem_usage() { } Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + allocator.alloc_location = base_allocator.alloc_location; + return allocator; } } // namespace FlexFlow From 64034a585d991703c3f958d263ec82dc8df1b884 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:18:45 -0700 Subject: [PATCH 07/42] cpu_kernel's refactor, generic tensor accessor indexing --- lib/kernels/CMakeLists.txt | 1 + lib/kernels/include/kernels/accessor.h | 112 ++++++++++++++--- lib/kernels/include/kernels/allocation.h | 12 +- .../include/kernels/attention_kernels.h | 6 +- .../include/kernels/batch_matmul_kernels.h | 8 +- .../include/kernels/batch_norm_kernels.h | 6 +- lib/kernels/include/kernels/cast_kernels.h | 8 +- .../include/kernels/cast_kernels_cpu.h | 8 +- lib/kernels/include/kernels/combine_kernels.h | 8 +- .../include/kernels/combine_kernels_cpu.h | 8 +- lib/kernels/include/kernels/concat_kernels.h | 8 +- lib/kernels/include/kernels/conv_2d_kernels.h | 6 +- .../include/kernels/datatype_dispatch.h | 3 +- lib/kernels/include/kernels/dropout_kernels.h | 6 +- .../include/kernels/element_binary_kernels.h | 6 +- .../include/kernels/element_unary_kernels.h | 6 +- .../include/kernels/embedding_kernels.h | 8 +- lib/kernels/include/kernels/flat_kernels.h | 8 +- lib/kernels/include/kernels/gather_kernels.h | 6 +- .../include/kernels/layer_norm_kernels.h | 6 +- lib/kernels/include/kernels/linear_kernels.h | 6 +- .../include/kernels/local_cpu_allocator.h | 7 +- .../include/kernels/local_cuda_allocator.h | 3 +- lib/kernels/include/kernels/nccl.h | 8 +- .../include/kernels/partition_kernels.h | 6 +- lib/kernels/include/kernels/pool_2d_kernels.h | 6 +- lib/kernels/include/kernels/reduce_kernels.h | 6 +- .../include/kernels/reduction_kernels.h | 8 +- .../include/kernels/replicate_kernels.h | 8 +- 
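[editor's note, not part of the original patch series] The most interesting
piece of this patch (07/42) is the new element-access path on the accessors:
both GenericTensorAccessorW and GenericTensorAccessorR gain a variadic
at<DT>(indices...) that rejects non-CPU tensors and mismatched DataTypes, then
delegates to a private calculate_index_offset helper whose definition lands in
lib/kernels/src/accessor.cc (not reproduced in this excerpt). Below is a
minimal sketch of the row-major arithmetic such a helper has to perform; the
free-function signature and the explicit dims parameter are illustrative
assumptions, not the patch's exact interface:

#include <cassert>
#include <cstddef>
#include <initializer_list>
#include <vector>

size_t calculate_index_offset(std::initializer_list<size_t> const &indices,
                              std::vector<size_t> const &dims) {
  // Row-major flattening: offset = ((i0 * d1 + i1) * d2 + i2) * ...
  // The first multiplication scales an accumulator of 0, so d0 itself never
  // multiplies anything -- only the trailing dimensions do.
  assert(indices.size() == dims.size());
  size_t offset = 0;
  size_t dim_idx = 0;
  for (size_t idx : indices) {
    assert(idx < dims[dim_idx]); // reject out-of-range coordinates
    offset = offset * dims[dim_idx] + idx;
    ++dim_idx;
  }
  return offset;
}

For dims = {2, 3, 4}, calculate_index_offset({1, 2, 3}, dims) returns
(1 * 3 + 2) * 4 + 3 = 23, the familiar flattened index.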
.../include/kernels/replicate_kernels_cpu.h | 8 +- lib/kernels/include/kernels/reshape_kernels.h | 6 +- lib/kernels/include/kernels/reverse_kernels.h | 8 +- .../include/kernels/reverse_kernels_cpu.h | 23 ++-- lib/kernels/include/kernels/softmax_kernels.h | 6 +- lib/kernels/include/kernels/split_kernels.h | 9 +- lib/kernels/include/kernels/topk_kernels.h | 6 +- .../include/kernels/transpose_kernels.h | 6 +- lib/kernels/src/accessor.cc | 118 +++++++++++++++++- lib/kernels/src/allocation.cc | 19 +-- lib/kernels/src/array_shape.cc | 1 + lib/kernels/src/cpu/cast_kernels.cc | 8 +- lib/kernels/src/cpu/combine_kernels.cc | 8 +- lib/kernels/src/cpu/replicate_kernels.cc | 21 ++-- lib/kernels/src/cpu/reverse_kernels.cc | 101 +++++++-------- lib/kernels/src/cuda/ops/concat_kernels.cu | 8 +- lib/kernels/src/local_cpu_allocator.cc | 31 +---- lib/kernels/src/local_cuda_allocator.cc | 13 +- lib/kernels/test/CMakeLists.txt | 1 + lib/kernels/test/src/test_cast_kernel.cc | 24 ++-- lib/kernels/test/src/test_replicate_kernel.cc | 51 ++++---- lib/kernels/test/src/test_reverse_kernels.cc | 53 ++++---- lib/kernels/test/src/test_utils.h | 95 ++++---------- .../local-execution/local_cpu_allocator.h | 2 + .../local-execution/tracked_allocator.h | 4 +- .../src/local_cpu_allocator.cc | 4 + .../src/local_task_argument_accessor.cc | 11 +- lib/local-execution/src/tracked_allocator.cc | 11 +- 57 files changed, 473 insertions(+), 481 deletions(-) diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..fc91b7d3db 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -30,6 +30,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index e30e1fe825..846115060f 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -5,6 +5,7 @@ #include "device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" +#include "pcg/device_type.dtg.h" #include "utils/exception.h" #include "utils/required.h" @@ -28,20 +29,65 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; - GenericTensorAccessorW(DataType dt, - ArrayShape sh, - req p, - bool on_dev = true) - : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + GenericTensorAccessorW() = delete; + + GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, void *ptr, DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + template + real_type_t
<DT>
&at(Indices... indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
<DT>
; + + T *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } + + template + real_type_t
<DT>
const &at(Indices... indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
<DT>
; + + T const *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; - bool on_device; + void *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; + + size_t calculate_index_offset( + std::initializer_list const &indices) const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - GenericTensorAccessorW, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorW const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); @@ -64,20 +110,50 @@ class GenericTensorAccessorR { double const *get_double_ptr() const; half const *get_half_ptr() const; - GenericTensorAccessorR(DataType dt, - ArrayShape sh, - req p, - bool on_dev = true) - : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
<DT>
const &at(Indices... indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
<DT>
; + + T const *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; - bool on_device; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; + + size_t calculate_index_offset( + std::initializer_list const &indices) const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - GenericTensorAccessorR, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorR const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 452ccc47b0..893be513ea 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,15 +5,14 @@ #include #include -enum class AllocLocation { HOST, DEVICE }; - namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; - virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -21,13 +20,12 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); - GenericTensorAccessorW - allocate_tensor_and_zero(TensorShape const &tensor_shape); void *allocate(size_t mem_size); - void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type @@ -37,8 +35,6 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; - AllocLocation alloc_location; - private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..1e483102dd 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..bde91bea15 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -5,9 +5,7 @@ #include "kernels/allocation.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 
7d533d672c..4de6ac6af0 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -43,8 +43,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState, output_w, relu); -namespace Kernels { -namespace BatchNorm { +namespace Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -81,8 +80,7 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels +} // namespace Kernels::BatchNorm } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 502a823ca7..f67613cec6 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -20,8 +18,6 @@ void backward_kernel(ffStream_t stream, DataType input_type, DataType output_type); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index cae0c9da8d..959617dcae 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void cpu_backward_kernel(GenericTensorAccessorR const &input, DataType input_type, DataType output_type); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..50de18e823 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h index 66c22ddbf8..430c7cf906 100644 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); @@ -14,8 +12,6 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input, void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, 
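Note: the namespace consolidation applied across all of these kernel headers relies on C++17 nested namespace definitions, which declare exactly the same entities as the nested form but with a single open/close pair. A minimal sketch of the equivalence (the function name here is illustrative, not from the patch):

    // Pre-C++17 form: every level opened and closed separately.
    namespace FlexFlow { namespace Kernels { namespace Combine {
    void cpu_forward_kernel();
    }}} // namespace FlexFlow::Kernels::Combine

    // C++17 form: identical symbols and mangling, one brace pair.
    namespace FlexFlow::Kernels::Combine {
    void cpu_forward_kernel();
    } // namespace FlexFlow::Kernels::Combine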
diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h
index 66c22ddbf8..430c7cf906 100644
--- a/lib/kernels/include/kernels/combine_kernels_cpu.h
+++ b/lib/kernels/include/kernels/combine_kernels_cpu.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Combine {
+namespace FlexFlow::Kernels::Combine {

 void cpu_forward_kernel(GenericTensorAccessorR const &input,
                         GenericTensorAccessorW const &output);
@@ -14,8 +12,6 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input,
 void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
                          GenericTensorAccessorW const &input_grad);

-} // namespace Combine
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Combine

 #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H

diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h
index a44affc1f2..33355296dd 100644
--- a/lib/kernels/include/kernels/concat_kernels.h
+++ b/lib/kernels/include/kernels/concat_kernels.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Concat {
+namespace FlexFlow::Kernels::Concat {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorW const &output,
@@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream,
                      std::vector<GenericTensorAccessorR> const &input_grads,
                      ff_dim_t axis);

-} // namespace Concat
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Concat

 #endif

diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h
index cfc64f963d..217751e191 100644
--- a/lib/kernels/include/kernels/conv_2d_kernels.h
+++ b/lib/kernels/include/kernels/conv_2d_kernels.h
@@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState,
                                              bwdFilterAlgo,
                                              bwdDataAlgo);

-namespace Kernels {
-namespace Conv2D {
+namespace Kernels::Conv2D {

 Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                  std::optional<Activation> activation,
@@ -70,8 +69,7 @@ void backward_kernel(ffStream_t stream,
                      float *bias_grad_ptr,
                      std::optional<Activation> activation);

-} // namespace Conv2D
-} // namespace Kernels
+} // namespace Kernels::Conv2D
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H

diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h
index e83fc3325d..0986d99791 100644
--- a/lib/kernels/include/kernels/datatype_dispatch.h
+++ b/lib/kernels/include/kernels/datatype_dispatch.h
@@ -1,7 +1,8 @@
 #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H
 #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H

-#include "accessor.h"
+#include "op-attrs/datatype.h"
+#include "utils/exception.h"

 namespace FlexFlow {

diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h
index c0e503be5b..4790540098 100644
--- a/lib/kernels/include/kernels/dropout_kernels.h
+++ b/lib/kernels/include/kernels/dropout_kernels.h
@@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState,
                                              reserveSpaceSize,
                                              dropoutStateSize);

-namespace Kernels {
-namespace Dropout {
+namespace Kernels::Dropout {

 DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                   float rate,
@@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator,
                     ffDropoutDescriptor_t dropoutDesc,
                     void *dropoutStates);

-} // namespace Dropout
-} // namespace Kernels
+} // namespace Kernels::Dropout
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H

diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h
index 41447e98e6..1017230fb0 100644
--- a/lib/kernels/include/kernels/element_binary_kernels.h
+++ b/lib/kernels/include/kernels/element_binary_kernels.h
@@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState,
                                              opDesc,
                                              reduceAddDesc);

-namespace Kernels {
-namespace ElementBinary {
+namespace Kernels::ElementBinary {

 ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                         OperatorType op_type,
@@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream,
                      bool broadcast_inputRHS,
                      PerDeviceFFHandle handle);

-} // namespace ElementBinary
-} // namespace Kernels
+} // namespace Kernels::ElementBinary
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h
index 8c6864b2d9..26ce4ecaec 100644
--- a/lib/kernels/include/kernels/element_unary_kernels.h
+++ b/lib/kernels/include/kernels/element_unary_kernels.h
@@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState,
                           outputTensor,
                           actiDesc);

-namespace Kernels {
-namespace ElementUnary {
+namespace Kernels::ElementUnary {

 ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape,
                                        ArrayShape const &output_shape,
@@ -42,8 +41,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &output,
                      GenericTensorAccessorR const &output_grad);

-} // namespace ElementUnary
-} // namespace Kernels
+} // namespace Kernels::ElementUnary
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h
index 06582ca1d5..6d5141f489 100644
--- a/lib/kernels/include/kernels/embedding_kernels.h
+++ b/lib/kernels/include/kernels/embedding_kernels.h
@@ -5,9 +5,7 @@
 #include "kernels/accessor.h"
 #include "op-attrs/ops/embedding.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Embedding {
+namespace FlexFlow::Kernels::Embedding {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR const &input,
                     GenericTensorAccessorW const &output,
@@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p);
 template <typename TD>
 __global__ void rand_generate_int(TD *ptr, size_t size, TD p);

-} // namespace Embedding
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Embedding

 #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H

diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h
index 3e600c48de..41b411c937 100644
--- a/lib/kernels/include/kernels/flat_kernels.h
+++ b/lib/kernels/include/kernels/flat_kernels.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Flat {
+namespace FlexFlow::Kernels::Flat {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR input,
@@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream,
                      float *input_grad_ptr,
                      float const *output_grad_ptr);

-} // namespace Flat
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Flat

 #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H

diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h
index 13bf4b898a..af2da3b11f 100644
--- a/lib/kernels/include/kernels/gather_kernels.h
+++ b/lib/kernels/include/kernels/gather_kernels.h
@@ -15,8 +15,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState,
                                              handle,
                                              legion_dim);

-namespace Kernels {
-namespace Gather {
+namespace Kernels::Gather {

 void forward_kernel(ffStream_t stream,
                     GatherPerDeviceState const &m,
@@ -30,8 +29,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &index,
                      GenericTensorAccessorW const &input_grad);

-} // namespace Gather
-} // namespace Kernels
+} // namespace Kernels::Gather
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h
index be13d32879..a6ae87442a 100644
--- a/lib/kernels/include/kernels/layer_norm_kernels.h
+++ b/lib/kernels/include/kernels/layer_norm_kernels.h
@@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState,
                                              bias,
                                              data_type);

-namespace Kernels {
-namespace LayerNorm {
+namespace Kernels::LayerNorm {

 // todo: this may have some problem.
 LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
@@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorW const &gamma_grad,
                      GenericTensorAccessorW const &beta_grad);

-} // namespace LayerNorm
-} // namespace Kernels
+} // namespace Kernels::LayerNorm
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H

diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h
index 3128e39fd0..99549adece 100644
--- a/lib/kernels/include/kernels/linear_kernels.h
+++ b/lib/kernels/include/kernels/linear_kernels.h
@@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState,
                                              weight_type,
                                              output_type);

-namespace Kernels {
-namespace Linear {
+namespace Kernels::Linear {

 LinearPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                  float *one_ptr,
@@ -72,8 +71,7 @@ void backward_kernel(ffStream_t stream,
                      int out_dim,
                      int batch_size);

-} // namespace Linear
-} // namespace Kernels
+} // namespace Kernels::Linear
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h
index 121ed184e9..cf6cfe35d1 100644
--- a/lib/kernels/include/kernels/local_cpu_allocator.h
+++ b/lib/kernels/include/kernels/local_cpu_allocator.h
@@ -7,14 +7,15 @@ struct LocalCPUAllocator : public IAllocator {
   LocalCPUAllocator() = default;
   LocalCPUAllocator(LocalCPUAllocator const &) = delete;
   LocalCPUAllocator(LocalCPUAllocator &&) = delete;
-  ~LocalCPUAllocator() override;
+  ~LocalCPUAllocator() = default;

   void *allocate(size_t) override;
-  void *allocate_and_zero(size_t) override;
   void deallocate(void *) override;

+  DeviceType get_allocation_device_type() const override;
+
 private:
-  std::unordered_set<void *> ptrs;
+  std::unordered_map<void *, std::unique_ptr<void, decltype(&free)>> ptrs;
 };
 CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator);

diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h
index 16f60daead..b8e0540974 100644
--- a/lib/kernels/include/kernels/local_cuda_allocator.h
+++ b/lib/kernels/include/kernels/local_cuda_allocator.h
@@ -10,9 +10,10 @@ struct LocalCudaAllocator : public IAllocator {
   ~LocalCudaAllocator() override;

   void *allocate(size_t) override;
-  void *allocate_and_zero(size_t) override;
   void deallocate(void *) override;

+  DeviceType get_allocation_device_type() const override;
+
 private:
   std::unordered_set<void *> ptrs;
 };

diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h
index b8a6784676..042911d172 100644
--- a/lib/kernels/include/kernels/nccl.h
+++ b/lib/kernels/include/kernels/nccl.h
@@ -23,15 +23,11 @@ struct ncclUniqueId {};
 struct ncclComm_t {};
 #endif

-namespace FlexFlow {
-namespace Kernels {
-namespace NCCL {
+namespace FlexFlow::Kernels::NCCL {

 ncclUniqueId generate_unique_id();

 ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank);

-} // namespace NCCL
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::NCCL

 #endif

diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h
index 64ef1a1352..e580c4a9de 100644
--- a/lib/kernels/include/kernels/partition_kernels.h
+++ b/lib/kernels/include/kernels/partition_kernels.h
@@ -13,8 +13,7 @@ struct RepartitionPerDeviceState {

 FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type);

-namespace Kernels {
-namespace Repartition {
+namespace Kernels::Repartition {

 RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
                                       DataType data_type);
@@ -29,8 +28,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorW const &output_grad,
                      GenericTensorAccessorR const &input_grad);

-} // namespace Repartition
-} // namespace Kernels
+} // namespace Kernels::Repartition
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H

diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h
index 798c0507f8..191c23bc98 100644
--- a/lib/kernels/include/kernels/pool_2d_kernels.h
+++ b/lib/kernels/include/kernels/pool_2d_kernels.h
@@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState,
                                              poolDesc,
                                              relu);

-namespace Kernels {
-namespace Pool2D {
+namespace Kernels::Pool2D {

 Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                  std::optional<Activation> activation,
@@ -75,8 +74,7 @@ void backward_kernel(ffStream_t stream,
                      void const *output_ptr,
                      void const *output_grad_ptr);

-} // namespace Pool2D
-} // namespace Kernels
+} // namespace Kernels::Pool2D
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H

diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h
index 4287472875..cd3930ea1c 100644
--- a/lib/kernels/include/kernels/reduce_kernels.h
+++ b/lib/kernels/include/kernels/reduce_kernels.h
@@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState,
                     op_type,
                     reduction_size);

-namespace Kernels {
-namespace Reduce {
+namespace Kernels::Reduce {

 ReducePerDeviceState init_kernel(PerDeviceFFHandle const &,
                                  OperatorType const &,
@@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream,
                      ReducePerDeviceState const &m,
                      float const *output_grad_ptr,
                      float *input_grad_ptr);
-} // namespace Reduce
-} // namespace Kernels
+} // namespace Kernels::Reduce
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H

diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h
index fb3baf215c..7e1e240ea4 100644
--- a/lib/kernels/include/kernels/reduction_kernels.h
+++ b/lib/kernels/include/kernels/reduction_kernels.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Reduction {
+namespace FlexFlow::Kernels::Reduction {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR const &input,
@@ -17,8 +15,6 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorW const &input,
                      GenericTensorAccessorR const &output);

-} // namespace Reduction
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Reduction

 #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H

diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h
index 409fc81f44..877eeabf04 100644
--- a/lib/kernels/include/kernels/replicate_kernels.h
+++ b/lib/kernels/include/kernels/replicate_kernels.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Replicate {
+namespace FlexFlow::Kernels::Replicate {

 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR const &input,
@@ -17,8 +15,6 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &output,
                      size_t num_replicas);

-} // namespace Replicate
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Replicate

 #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H

diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h
index 11d2f1bf5c..a72b799875 100644
--- a/lib/kernels/include/kernels/replicate_kernels_cpu.h
+++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h
@@ -4,9 +4,7 @@
 #include "device.h"
 #include "kernels/accessor.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Replicate {
+namespace FlexFlow::Kernels::Replicate {

 void cpu_forward_kernel(GenericTensorAccessorR const &input,
                         GenericTensorAccessorW const &output);

@@ -15,8 +13,6 @@ void cpu_backward_kernel(GenericTensorAccessorW const &input,
                          GenericTensorAccessorR const &output,
                          size_t num_replicas);

-} // namespace Replicate
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Replicate

 #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H

diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h
index a83caa6bea..5fa4382c43 100644
--- a/lib/kernels/include/kernels/reshape_kernels.h
+++ b/lib/kernels/include/kernels/reshape_kernels.h
@@ -13,8 +13,7 @@ struct ReshapePerDeviceState {

 FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type);

-namespace Kernels {
-namespace Reshape {
+namespace Kernels::Reshape {

 ReshapePerDeviceState init_kernel(DataType data_type);

@@ -28,8 +27,7 @@ void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorW const &input,
                      GenericTensorAccessorR const &output);

-} // namespace Reshape
-} // namespace Kernels
+} // namespace Kernels::Reshape
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H

diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h
index 42a83ae219..deb5b22155 100644
--- a/lib/kernels/include/kernels/reverse_kernels.h
+++ b/lib/kernels/include/kernels/reverse_kernels.h
@@ -3,9 +3,7 @@

 #include "device.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Reverse {
+namespace FlexFlow::Kernels::Reverse {

 void forward_kernel(ffStream_t stream,
                     float const *in_ptr,
@@ -23,8 +21,6 @@ void backward_kernel(ffStream_t stream,
                      coord_t in_blk_size,
                      coord_t input_size);

-} // namespace Reverse
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Reverse

 #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H

diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h
index bb17aa9400..b0edaa264c 100644
--- a/lib/kernels/include/kernels/reverse_kernels_cpu.h
+++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h
@@ -1,27 +1,22 @@
 #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H
 #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H

+#include "accessor.h"
 #include "device.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Reverse {
+namespace FlexFlow::Kernels::Reverse {

-void cpu_forward_kernel(float const *in_ptr,
-                        float *out_ptr,
+void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor,
+                        GenericTensorAccessorW &output_accessor,
                         coord_t num_out_blks,
                         coord_t reverse_dim_size,
-                        coord_t in_blk_size,
-                        coord_t output_size);
+                        coord_t in_blk_size);

-void cpu_backward_kernel(float const *out_grad_ptr,
-                         float *in_grad_ptr,
+void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor,
+                         GenericTensorAccessorW &input_accessor,
                          coord_t num_out_blks,
                          coord_t reverse_dim_size,
-                         coord_t in_blk_size,
-                         coord_t input_size);
-} // namespace Reverse
-} // namespace Kernels
-} // namespace FlexFlow
+                         coord_t in_blk_size);
+} // namespace FlexFlow::Kernels::Reverse

 #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H

diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h
index 061230ec52..93135cb648 100644
--- a/lib/kernels/include/kernels/softmax_kernels.h
+++ b/lib/kernels/include/kernels/softmax_kernels.h
@@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState {

 FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim);

-namespace Kernels {
-namespace Softmax {
+namespace Kernels::Softmax {

 SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
                                   int dim,
@@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream,
                      float const *output_grad_ptr,
                      size_t num_elements);

-} // namespace Softmax
-} // namespace Kernels
+} // namespace Kernels::Softmax
 } // namespace FlexFlow

 #endif

diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h
index 36434d4be8..538b9602c2 100644
--- a/lib/kernels/include/kernels/split_kernels.h
+++ b/lib/kernels/include/kernels/split_kernels.h
@@ -3,10 +3,7 @@

 #include "device.h"

-namespace FlexFlow {
-
-namespace Kernels {
-namespace Split {
+namespace FlexFlow::Kernels::Split {
 void forward_kernel(ffStream_t stream,
                     float **out_ptrs,
                     float const *in_ptr,
@@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream,
                      coord_t num_blks,
                      int numOutputs);

-} // namespace Split
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Split

 #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H

diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h
index ae1c739f6c..6f33381e1a 100644
--- a/lib/kernels/include/kernels/topk_kernels.h
+++ b/lib/kernels/include/kernels/topk_kernels.h
@@ -12,8 +12,7 @@ struct TopKPerDeviceState {

 FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted);

-namespace Kernels {
-namespace TopK {
+namespace Kernels::TopK {

 TopKPerDeviceState init_kernel(bool sorted);

@@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream,
                      int length,
                      int k);

-} // namespace TopK
-} // namespace Kernels
+} // namespace Kernels::TopK
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H

diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h
index 56da81ba2b..b48b7e0aa8 100644
--- a/lib/kernels/include/kernels/transpose_kernels.h
+++ b/lib/kernels/include/kernels/transpose_kernels.h
@@ -16,8 +16,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TransposePerDeviceState,
                                              num_dim,
                                              perm);

-namespace Kernels {
-namespace Transpose {
+namespace Kernels::Transpose {

 TransposePerDeviceState init_kernel(int num_dim,
                                     std::vector<ff_dim_t> const &perm);
@@ -32,8 +31,7 @@ void backward_kernel(cudaStream_t stream,
                      GenericTensorAccessorW const &in_grad,
                      GenericTensorAccessorR const &out_grad);

-} // namespace Transpose
-} // namespace Kernels
+} // namespace Kernels::Transpose
 } // namespace FlexFlow

 #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H
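Note: the accessor.cc changes that follow implement the bounds-checked, row-major offset computation behind at(). A standalone sketch of the same scheme, assuming plain std::vector shapes rather than ArrayShape/legion_dim_t:

    #include <cstddef>
    #include <stdexcept>
    #include <vector>

    // offset = sum_i idx[i] * stride[i]; the innermost dimension has stride 1
    // and each outer stride is the product of all inner extents.
    size_t row_major_offset(std::vector<size_t> const &idx,
                            std::vector<size_t> const &dims) {
      if (idx.size() != dims.size()) {
        throw std::runtime_error("rank mismatch");
      }
      size_t offset = 0;
      size_t multiplier = 1;
      for (size_t i = dims.size(); i-- > 0;) {
        if (idx[i] >= dims[i]) {
          throw std::runtime_error("index out of bounds");
        }
        offset += idx[i] * multiplier;
        multiplier *= dims[i];
      }
      return offset;
    }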
diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc
index 66d3c02300..c0b11a2299 100644
--- a/lib/kernels/src/accessor.cc
+++ b/lib/kernels/src/accessor.cc
@@ -2,6 +2,64 @@
 namespace FlexFlow {

+GenericTensorAccessorW::GenericTensorAccessorW(
+    DataType data_type,
+    ArrayShape const &shape,
+    void *ptr,
+    DeviceType device_type = DeviceType::GPU)
+    : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {}
+
+std::tuple<DataType, ArrayShape, void *, DeviceType>
+    GenericTensorAccessorW::tie() const {
+  return std::tie(this->data_type, this->shape, this->ptr, this->device_type);
+}
+
+size_t GenericTensorAccessorW::calculate_index_offset(
+    std::initializer_list<size_t> const &indices) const {
+
+  if (indices.size() != this->shape.num_dims()) {
+    throw mk_runtime_error(
+        "Number of indices ({}) does not match the number of dimensions ({}).",
+        indices.size(),
+        this->shape.num_dims());
+  }
+
+  size_t offset = 0;
+  size_t multiplier = 1;
+  size_t cur_idx;
+  auto it = indices.end() - 1;
+
+  for (std::size_t i = this->shape.num_dims(); i-- > 0;) {
+    cur_idx = *it--;
+
+    if (cur_idx >= this->shape[legion_dim_t(i)]) {
+      throw mk_runtime_error("In {} dimension, attempting to access index {} "
+                             "when only {} indexes exist",
+                             i,
+                             cur_idx,
+                             this->shape[legion_dim_t(i)]);
+    }
+
+    offset += cur_idx * multiplier;
+    multiplier *= this->shape[legion_dim_t(i)];
+  }
+
+  return offset;
+}
+
+bool GenericTensorAccessorW::operator==(
+    GenericTensorAccessorW const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool GenericTensorAccessorW::operator!=(
+    GenericTensorAccessorW const &other) const {
+  return this->tie() != other.tie();
+}
+
 int32_t *GenericTensorAccessorW::get_int32_ptr() const {
   return this->get<DataType::INT32>();
 }
@@ -33,6 +91,64 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) {
   return (s << fmt::to_string(a));
 }

+GenericTensorAccessorR::GenericTensorAccessorR(
+    DataType data_type,
+    ArrayShape const &shape,
+    void const *ptr,
+    DeviceType device_type = DeviceType::GPU)
+    : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {}
+
+std::tuple<DataType, ArrayShape, void const *, DeviceType>
+    GenericTensorAccessorR::tie() const {
+  return std::tie(this->data_type, this->shape, this->ptr, this->device_type);
+}
+
+size_t GenericTensorAccessorR::calculate_index_offset(
+    std::initializer_list<size_t> const &indices) const {
+
+  if (indices.size() != this->shape.num_dims()) {
+    throw mk_runtime_error(
+        "Number of indices ({}) does not match the number of dimensions ({}).",
+        indices.size(),
+        this->shape.num_dims());
+  }
+
+  size_t offset = 0;
+  size_t multiplier = 1;
+  size_t cur_idx;
+  auto it = indices.end() - 1;
+
+  for (std::size_t i = this->shape.num_dims(); i-- > 0;) {
+    cur_idx = *it--;
+
+    if (cur_idx >= this->shape[legion_dim_t(i)]) {
+      throw mk_runtime_error("In {} dimension, attempting to access index {} "
+                             "when only {} indexes exist",
+                             i,
+                             cur_idx,
+                             this->shape[legion_dim_t(i)]);
+    }
+
+    offset += cur_idx * multiplier;
+    multiplier *= this->shape[legion_dim_t(i)];
+  }
+
+  return offset;
+}
+
+bool GenericTensorAccessorR::operator==(
+    GenericTensorAccessorR const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool GenericTensorAccessorR::operator!=(
+    GenericTensorAccessorR const &other) const {
+  return this->tie() != other.tie();
+}
+
 int32_t const *GenericTensorAccessorR::get_int32_ptr() const {
   return this->get<DataType::INT32>();
 }
@@ -159,7 +275,7 @@ GenericTensorAccessorR read_only_accessor_from_write_accessor(
   return GenericTensorAccessorR{writable.data_type,
                                 writable.shape,
                                 req(writable.ptr),
-                                writable.on_device};
+                                writable.device_type};
 }

 bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1,

diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
index ce06fbabe0..751cdc0ebb 100644
--- a/lib/kernels/src/allocation.cc
+++ b/lib/kernels/src/allocation.cc
@@ -7,26 +7,19 @@ void *Allocator::allocate(size_t mem_size) {
   return this->i_allocator->allocate(mem_size);
 }

-void *Allocator::allocate_and_zero(size_t mem_size) {
-  return this->i_allocator->allocate_and_zero(mem_size);
-}
-
 void Allocator::deallocate(void *ptr) {
   this->i_allocator->deallocate(ptr);
 }

-GenericTensorAccessorW
-    Allocator::allocate_tensor(TensorShape const &tensor_shape) {
-  void *ptr = this->allocate(get_size_in_bytes(tensor_shape));
-  bool on_device = this->alloc_location == AllocLocation::DEVICE;
-  return {tensor_shape.data_type, tensor_shape, ptr, on_device};
+DeviceType Allocator::get_allocation_device_type() const {
+  return this->i_allocator->get_allocation_device_type();
 }

 GenericTensorAccessorW
-    Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) {
-  void *ptr = this->allocate_and_zero(get_size_in_bytes(tensor_shape));
-  bool on_device = this->alloc_location == AllocLocation::DEVICE;
-  return {tensor_shape.data_type, tensor_shape, ptr, on_device};
+    Allocator::allocate_tensor(TensorShape const &tensor_shape) {
+  void *ptr = this->allocate(get_size_in_bytes(tensor_shape));
+  return {
+      tensor_shape.data_type, tensor_shape, ptr, get_allocation_device_type()};
 }

 } // namespace FlexFlow

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index d5e2f1167d..5c18a9ab5a 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -53,6 +53,7 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
 ArrayShape ArrayShape::sub_shape(
     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+
   NOT_IMPLEMENTED();
 }

diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc
index 5888d9a96a..2d3f440c75 100644
--- a/lib/kernels/src/cpu/cast_kernels.cc
+++ b/lib/kernels/src/cpu/cast_kernels.cc
@@ -1,9 +1,7 @@
 #include "kernels/cast_kernels_cpu.h"
 #include "kernels/datatype_dispatch.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Cast {
+namespace FlexFlow::Kernels::Cast {

 template <typename IDT, typename ODT>
 void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) {
@@ -53,6 +51,4 @@ void cpu_backward_kernel(GenericTensorAccessorR const &input,
       input_type, output_type, input, output);
 }

-} // namespace Cast
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Cast

diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc
index e48f4c3e01..d0be1f9f2d 100644
--- a/lib/kernels/src/cpu/combine_kernels.cc
+++ b/lib/kernels/src/cpu/combine_kernels.cc
@@ -1,9 +1,7 @@
 #include "kernels/combine_kernels_cpu.h"
 #include "kernels/datatype_dispatch.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Combine {
+namespace FlexFlow::Kernels::Combine {

 template <DataType DT>
 struct CPUForwardKernel {
@@ -37,6 +35,4 @@ void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
       input_grad.data_type, output_grad, input_grad);
 }

-} // namespace Combine
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Combine
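Note: both CPU kernels above route a runtime DataType tag to a compile-time template parameter through the DataTypeDispatch helpers. A reduced sketch of the dispatch idea, with only two enumerators shown (the real helper in kernels/datatype_dispatch.h covers every DataType value):

    #include <utility>

    enum class DataType { FLOAT, INT32 };

    // F is a functor template indexed by a compile-time DataType; the switch
    // selects the instantiation matching the runtime tag.
    template <template <DataType> class F, typename... Args>
    void dispatch(DataType dt, Args &&...args) {
      switch (dt) {
        case DataType::FLOAT:
          return F<DataType::FLOAT>{}(std::forward<Args>(args)...);
        case DataType::INT32:
          return F<DataType::INT32>{}(std::forward<Args>(args)...);
      }
    }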
diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc
index 239baf4041..5853869047 100644
--- a/lib/kernels/src/cpu/replicate_kernels.cc
+++ b/lib/kernels/src/cpu/replicate_kernels.cc
@@ -1,26 +1,22 @@
 #include "kernels/datatype_dispatch.h"
 #include "kernels/replicate_kernels_cpu.h"

-namespace FlexFlow {
-namespace Kernels {
-namespace Replicate {
+namespace FlexFlow::Kernels::Replicate {

 template <typename T>
 void cpu_replicate_backward_kernel(T *input,
                                    T const *output,
                                    size_t num_elements,
                                    size_t num_replicas) {
-  for (size_t i = 0; i < num_elements; ++i) {
+  for (size_t i = 0; i < num_elements; i++) {
     T sum = 0;
-    for (size_t j = 0; j < num_replicas; ++j) {
+    for (size_t j = 0; j < num_replicas; j++) {
       sum += output[i + j * num_elements];
     }
     input[i] = sum;
   }
 }

-// Why does replicate forward seem to only transfer memory? Shouldn't it also
-// handle the replication?
 template <DataType DT>
 struct CPUForwardKernel {
   void operator()(GenericTensorAccessorR const &input,
@@ -36,9 +32,10 @@ struct CPUBackwardKernel {
   void operator()(GenericTensorAccessorW const &input,
                   GenericTensorAccessorR const &output,
                   size_t num_replicas) {
-    size_t total_elements = input.shape.num_elements() * num_replicas;
-    cpu_replicate_backward_kernel(
-        input.get<DT>(), output.get<DT>(), total_elements, num_replicas);
+    cpu_replicate_backward_kernel(input.get<DT>(),
+                                  output.get<DT>(),
+                                  input.shape.num_elements(),
+                                  num_replicas);
   }
 };

@@ -54,6 +51,4 @@ void cpu_backward_kernel(GenericTensorAccessorW const &input,
       input.data_type, input, output, num_replicas);
 }

-} // namespace Replicate
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Replicate

diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc
index 350dad03e9..1971435d8c 100644
--- a/lib/kernels/src/cpu/reverse_kernels.cc
+++ b/lib/kernels/src/cpu/reverse_kernels.cc
@@ -2,77 +2,66 @@
 #include <algorithm>
 #include <vector>

-namespace FlexFlow {
-namespace Kernels {
-namespace Reverse {
+namespace FlexFlow::Kernels::Reverse {

-void cpu_reverse_forward_kernel(float const *in_ptr,
-                                float *out_ptr,
-                                coord_t num_out_blks,
-                                coord_t reverse_dim_size,
-                                coord_t in_blk_size) {
-  coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size;
+template <DataType DT>
+struct CPUReverseForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output,
+                  coord_t num_out_blks,
+                  coord_t reverse_dim_size,
+                  coord_t in_blk_size) {
+    assert(input.data_type == DT && output.data_type == DT);

-  std::vector<std::vector<float>> in_blocks(num_out_blks * reverse_dim_size,
-                                            std::vector<float>(in_blk_size));
-
-  // For each output block, copy the input block into in_blocks
-  for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
-    // Each output block has reverse_dim_size input blocks
-    for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) {
-      coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size;
-
-      // Copy elements from in_ptr to the current block in in_blocks
-      std::vector<float> &current_block =
-          in_blocks[blk_idx * reverse_dim_size + rev_idx];
-      for (coord_t i = 0; i < in_blk_size; ++i) {
-        current_block[i] = in_ptr[start_idx + i];
+    // For each output block, copy the input block
+    for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
+      for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) {
+        for (coord_t i = 0; i < in_blk_size; ++i) {
+          output.at<DT>(blk_idx, rev_idx, i) =
+              input.at<DT>(blk_idx, rev_idx, i);
+        }
       }
     }
-  }

-  // Reverse the in_blocks within each output block
-  for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
-    auto block_start = in_blocks.begin() + blk_idx * reverse_dim_size;
-    auto block_end = block_start + reverse_dim_size;
-    std::reverse(block_start, block_end);
-  }
-
-  // Copy the reversed blocks to the output array
-  for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
-    for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) {
-      coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size;
+    // Reverse the blocks within each output block
+    for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) {
+      for (coord_t rev_idx = 0; rev_idx < reverse_dim_size / 2; ++rev_idx) {
+        coord_t start_idx = rev_idx;
+        coord_t end_idx = reverse_dim_size - 1 - rev_idx;

-      // Copy elements from the current block in in_blocks to out_ptr
-      std::vector<float> const &current_block =
-          in_blocks[blk_idx * reverse_dim_size + rev_idx];
-      for (coord_t i = 0; i < in_blk_size; ++i) {
-        out_ptr[start_idx + i] = current_block[i];
+        for (coord_t i = 0; i < in_blk_size; ++i) {
+          std::swap(output.at<DT>(blk_idx, start_idx, i),
+                    output.at<DT>(blk_idx, end_idx, i));
+        }
       }
     }
   }
-}
+};

-void cpu_forward_kernel(float const *in_ptr,
-                        float *out_ptr,
+void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor,
+                        GenericTensorAccessorW &output_accessor,
                         coord_t num_out_blks,
                         coord_t reverse_dim_size,
-                        coord_t in_blk_size,
-                        coord_t output_size) {
-  cpu_reverse_forward_kernel(
-      in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size);
+                        coord_t in_blk_size) {
+  DataTypeDispatch1<CPUReverseForwardKernel>{}(input_accessor.data_type,
+                                               input_accessor,
+                                               std::ref(output_accessor),
+                                               num_out_blks,
+                                               reverse_dim_size,
+                                               in_blk_size);
 }

-void cpu_backward_kernel(float const *out_grad_ptr,
-                         float *in_grad_ptr,
+void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor,
+                         GenericTensorAccessorW &input_accessor,
                          coord_t num_out_blks,
                          coord_t reverse_dim_size,
-                         coord_t in_blk_size,
-                         coord_t input_size) {
-  cpu_reverse_forward_kernel(
-      out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size);
+                         coord_t in_blk_size) {
+  DataTypeDispatch1<CPUReverseForwardKernel>{}(output_accessor.data_type,
+                                               output_accessor,
+                                               std::ref(input_accessor),
+                                               num_out_blks,
+                                               reverse_dim_size,
+                                               in_blk_size);
 }

-} // namespace Reverse
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Reverse

diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
index 68004738d2..ad216feda2 100644
--- a/lib/kernels/src/cuda/ops/concat_kernels.cu
+++ b/lib/kernels/src/cuda/ops/concat_kernels.cu
@@ -17,9 +17,7 @@
 #include "kernels/concat_kernels.h"
 #include <cassert>

-namespace FlexFlow {
-namespace Kernels {
-namespace Concat {
+namespace FlexFlow::Kernels::Concat {

 void calc_blk_size(size_t &num_blocks,
                    size_t &blk_size,
@@ -87,6 +85,4 @@ void backward_kernel(cudaStream_t stream,
   }
 }

-} // namespace Concat
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::Concat

diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc
index ced707edcc..5cf337c685 100644
--- a/lib/kernels/src/local_cpu_allocator.cc
+++ b/lib/kernels/src/local_cpu_allocator.cc
@@ -1,34 +1,16 @@
 #include "kernels/local_cpu_allocator.h"
 #include "kernels/device.h"
+#include "utils/containers/contains_key.h"

 namespace FlexFlow {
 void *LocalCPUAllocator::allocate(size_t requested_memory_size) {
   void *ptr = malloc(requested_memory_size);
-
-  if (ptr != nullptr) {
-    this->ptrs.insert(ptr);
-  } else {
-    throw std::bad_alloc();
-  }
-
-  return ptr;
-}
-
-void *LocalCPUAllocator::allocate_and_zero(size_t requested_memory_size) {
-  void *ptr = calloc(1, requested_memory_size);
-
-  if (ptr != nullptr) {
-    this->ptrs.insert(ptr);
-  } else {
-    throw std::bad_alloc();
-  }
-
+  this->ptrs.insert({ptr, std::unique_ptr<void, decltype(&free)>(ptr, free)});
   return ptr;
 }

 void LocalCPUAllocator::deallocate(void *ptr) {
-  if (contains(this->ptrs, ptr)) {
-    free(ptr);
+  if (contains_key(this->ptrs, ptr)) {
     this->ptrs.erase(ptr);
   } else {
     throw std::runtime_error(
@@ -36,15 +18,12 @@ void LocalCPUAllocator::deallocate(void *ptr) {
   }
 }

-LocalCPUAllocator::~LocalCPUAllocator() {
-  for (void *ptr : this->ptrs) {
-    free(ptr);
-  }
+DeviceType LocalCPUAllocator::get_allocation_device_type() const {
+  return DeviceType::CPU;
 }

 Allocator create_local_cpu_memory_allocator() {
   Allocator allocator = Allocator::create<LocalCPUAllocator>();
-  allocator.alloc_location = AllocLocation::HOST;
   return allocator;
 }
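Note: the allocator rewrite above keys each allocation by its raw pointer and hands ownership to a std::unique_ptr whose deleter is free, so erase() and the now-defaulted destructor each release memory exactly once. A reduced sketch of the ownership scheme (CpuPool is an illustrative stand-in for LocalCPUAllocator):

    #include <cstdlib>
    #include <memory>
    #include <unordered_map>

    struct CpuPool {
      void *allocate(size_t n) {
        void *p = std::malloc(n);
        // The map entry owns p; its deleter runs on erase() or ~CpuPool().
        ptrs.insert(
            {p, std::unique_ptr<void, decltype(&std::free)>(p, std::free)});
        return p;
      }
      void deallocate(void *p) {
        ptrs.erase(p); // the unique_ptr deleter frees p here
      }

    private:
      std::unordered_map<void *, std::unique_ptr<void, decltype(&std::free)>>
          ptrs;
    };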
diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc
index b6c615a5ca..416768a479 100644
--- a/lib/kernels/src/local_cuda_allocator.cc
+++ b/lib/kernels/src/local_cuda_allocator.cc
@@ -10,14 +10,6 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) {
   return ptr;
 }

-void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) {
-  void *ptr;
-  checkCUDA(cudaMalloc(&ptr, requested_memory_size));
-  checkCUDA(cudaMemset(ptr, 0, requested_memory_size));
-  this->ptrs.insert(ptr);
-  return ptr;
-}
-
 void LocalCudaAllocator::deallocate(void *ptr) {
   if (contains(this->ptrs, ptr)) {
     checkCUDA(cudaFree(ptr));
@@ -28,6 +20,10 @@ void LocalCudaAllocator::deallocate(void *ptr) {
   }
 }

+DeviceType LocalCudaAllocator::get_allocation_device_type() const {
+  return DeviceType::GPU;
+}
+
 LocalCudaAllocator::~LocalCudaAllocator() {
   for (void *ptr : this->ptrs) {
     checkCUDA(cudaFree(ptr));
@@ -36,7 +32,6 @@ LocalCudaAllocator::~LocalCudaAllocator() {

 Allocator create_local_cuda_memory_allocator() {
   Allocator allocator = Allocator::create<LocalCudaAllocator>();
-  allocator.alloc_location = AllocLocation::DEVICE;
   return allocator;
 }

diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt
index 00da2d0d70..066cb96753 100644
--- a/lib/kernels/test/CMakeLists.txt
+++ b/lib/kernels/test/CMakeLists.txt
@@ -14,6 +14,7 @@ ff_add_test_executable(
     cudnn
     cudart
     cublas
+    pcg
 )

 set(FF_TEST_EXEC_NAME "kernels-tests")

diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc
index a6990d2ad0..e9674cd167 100644
--- a/lib/kernels/test/src/test_cast_kernel.cc
+++ b/lib/kernels/test/src/test_cast_kernel.cc
@@ -94,44 +94,34 @@ TEST_SUITE(FF_TEST_SUITE) {

     // Only calling forward kernel as backward kernel is exactly the same
     SUBCASE("forward_kernel") {
-      auto transform = [start_val = 1.1f,
-                        counter = 0.0f](float input) mutable -> float {
-        return start_val + counter++;
-      };
-
       // Run GPU Forward Kernel
       GenericTensorAccessorW input_accessor_gpu =
-          create_transformed_accessor_w<float>(
-              input_shape, gpu_allocator, transform);
+          create_random_filled_accessor_w<DataType::FLOAT>(input_shape,
+                                                           gpu_allocator);
       Kernels::Cast::forward_kernel(
           managed_stream.raw_stream(),
           read_only_accessor_from_write_accessor(input_accessor_gpu),
           output_accessor_gpu,
           DataType::FLOAT,
           DataType::INT32);

-      std::cout << "Before GPU load" << std::endl;
+      std::vector<int32_t> result_data_gpu =
+          load_accessor_data<DataType::INT32>(output_accessor_gpu);

       // Run CPU Forward Kernel
       GenericTensorAccessorW input_accessor_cpu =
-          create_transformed_accessor_w<float>(
-              input_shape, cpu_allocator, transform);
+          create_random_filled_accessor_w<DataType::FLOAT>(input_shape,
+                                                           cpu_allocator);
       Kernels::Cast::cpu_forward_kernel(
           read_only_accessor_from_write_accessor(input_accessor_cpu),
           output_accessor_cpu,
           DataType::FLOAT,
           DataType::INT32);

-      std::cout << "Before CPU load" << std::endl;
-      if (output_accessor_cpu.on_device) {
-        std::cout << "CPU data is on device" << std::endl;
-      } else {
-        std::cout << "CPU data is on host" << std::endl;
-      }
+      std::vector<int32_t> result_data_cpu =
+          load_accessor_data<DataType::INT32>(output_accessor_cpu);

-      CHECK(result_data_gpu == result_data_cpu);
+      CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
     }
   }
 }

diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc
index e952f1107f..8af741b3a7 100644
--- a/lib/kernels/test/src/test_replicate_kernel.cc
+++ b/lib/kernels/test/src/test_replicate_kernel.cc
@@ -112,17 +112,12 @@ TEST_SUITE(FF_TEST_SUITE) {
   }

   TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") {
-    std::size_t num_replicas = 10;
+    std::size_t num_replicas = 2;

-    // This should be like three shapes: pre_replication, replication shape, and
-    // reduced shape, but things are weird cause doesn't seem to be replicating
-    // anything (ie. input shape should be same as reduced shape)
     TensorShape input_shape =
-        make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT);
-    TensorShape replicated_shape =
-        make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT);
-    TensorShape reduced_shape =
-        make_tensor_shape_from_legion_dims({10}, DataType::FLOAT);
+        make_tensor_shape_from_legion_dims({5}, DataType::FLOAT);
+    TensorShape output_shape =
+        make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT);

     ManagedPerDeviceFFHandle managed_handle{};
     ManagedFFStream managed_stream{};
@@ -136,7 +131,8 @@ TEST_SUITE(FF_TEST_SUITE) {
         create_random_filled_accessor_r<DataType::FLOAT>(input_shape,
                                                          gpu_allocator);
     GenericTensorAccessorW output_accessor_gpu =
-        gpu_allocator.allocate_tensor(replicated_shape);
+        gpu_allocator.allocate_tensor(output_shape);
+    fill_with_zeros(output_accessor_gpu);

     Kernels::Replicate::forward_kernel(
         managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu);
@@ -145,29 +141,29 @@ TEST_SUITE(FF_TEST_SUITE) {
         load_accessor_data<DataType::FLOAT>(output_accessor_gpu);

     // Run CPU Replicate Forward Kernel
-    GenericTensorAccessorW input_accessor_cpu =
-        copy_tensor_between_memories<DataType::FLOAT>(input_accessor_gpu,
-                                                      cpu_allocator);
+    GenericTensorAccessorR input_accessor_cpu =
+        copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
     GenericTensorAccessorW output_accessor_cpu =
-        cpu_allocator.allocate_tensor(replicated_shape);
+        cpu_allocator.allocate_tensor(output_shape);
+    fill_with_zeros(output_accessor_cpu);

-    Kernels::Replicate::cpu_forward_kernel(
-        read_only_accessor_from_write_accessor(input_accessor_cpu),
-        output_accessor_cpu);
+    Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu,
+                                           output_accessor_cpu);

     std::vector<float> result_data_cpu =
         load_accessor_data<DataType::FLOAT>(output_accessor_cpu);

-    CHECK(result_data_gpu == result_data_cpu);
+    CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
   }

   SUBCASE("backward_kernel") {
     // Run GPU Replicate Backward Kernel
     GenericTensorAccessorR output_grad_accessor_gpu =
-        create_random_filled_accessor_r<DataType::FLOAT>(replicated_shape,
+        create_random_filled_accessor_r<DataType::FLOAT>(output_shape,
                                                          gpu_allocator);
     GenericTensorAccessorW input_grad_accessor_gpu =
-        gpu_allocator.allocate_tensor_and_zero(reduced_shape);
+        gpu_allocator.allocate_tensor(input_shape);
+    fill_with_zeros(input_grad_accessor_gpu);

     Kernels::Replicate::backward_kernel(managed_stream.raw_stream(),
                                         input_grad_accessor_gpu,
@@ -178,21 +174,20 @@ TEST_SUITE(FF_TEST_SUITE) {
         load_accessor_data<DataType::FLOAT>(input_grad_accessor_gpu);

     // Run CPU Replicate Backward Kernel
-    GenericTensorAccessorW output_grad_accessor_cpu =
-        copy_tensor_between_memories<DataType::FLOAT>(
-            output_grad_accessor_gpu, cpu_allocator);
+    GenericTensorAccessorR output_grad_accessor_cpu =
+        copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
+
     GenericTensorAccessorW input_grad_accessor_cpu =
-        cpu_allocator.allocate_tensor_and_zero(reduced_shape);
+        cpu_allocator.allocate_tensor(input_shape);
+    fill_with_zeros(input_grad_accessor_cpu);

     Kernels::Replicate::cpu_backward_kernel(
         input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas);

     std::vector<float> result_data_cpu =
         load_accessor_data<DataType::FLOAT>(input_grad_accessor_cpu);

     CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
   }
 }
}

diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc
index 7899afa718..b1f90a0a7e 100644
--- a/lib/kernels/test/src/test_reverse_kernels.cc
+++ b/lib/kernels/test/src/test_reverse_kernels.cc
@@ -159,11 +159,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       };

       // Run GPU Cast Forward Kernel
-      GenericTensorAccessorW input_accessor_gpu =
-          create_transformed_accessor_w<float>(
-              input_shape, gpu_allocator, transform);
+      GenericTensorAccessorR input_accessor_gpu =
+          create_random_filled_accessor_r<DataType::FLOAT>(input_shape,
+                                                           gpu_allocator);
       GenericTensorAccessorW output_accessor_gpu =
           gpu_allocator.allocate_tensor(output_shape);
+      fill_with_zeros(output_accessor_gpu);

       Kernels::Reverse::forward_kernel(managed_stream.raw_stream(),
                                        input_accessor_gpu.get_float_ptr(),
@@ -177,33 +178,32 @@ TEST_SUITE(FF_TEST_SUITE) {
           load_accessor_data<DataType::FLOAT>(output_accessor_gpu);

       // Run CPU Cast Forward Kernel
-      GenericTensorAccessorW input_accessor_cpu =
-          create_transformed_accessor_w<float>(
-              input_shape, cpu_allocator, transform);
+      GenericTensorAccessorR input_accessor_cpu =
+          copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
       GenericTensorAccessorW output_accessor_cpu =
           cpu_allocator.allocate_tensor(output_shape);
+      fill_with_zeros(output_accessor_cpu);

-      Kernels::Reverse::cpu_forward_kernel(
-          input_accessor_cpu.get_float_ptr(),
-          output_accessor_cpu.get_float_ptr(),
-          num_out_blks,
-          reverse_dim_size,
-          in_blk_size,
-          input_accessor_cpu.shape.num_elements());
+      Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu,
+                                           output_accessor_cpu,
+                                           num_out_blks,
+                                           reverse_dim_size,
+                                           in_blk_size);

       std::vector<float> result_data_cpu =
           load_accessor_data<DataType::FLOAT>(output_accessor_cpu);

-      CHECK(result_data_gpu == result_data_cpu);
+      CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
     }

     SUBCASE("backward_kernel") {
       // Run GPU Cast Backward Kernel
-      GenericTensorAccessorW output_grad_accessor_gpu =
-          create_random_filled_accessor_w(output_shape,
+      GenericTensorAccessorR output_grad_accessor_gpu =
+          create_random_filled_accessor_r<DataType::FLOAT>(output_shape,
                                                            gpu_allocator);
       GenericTensorAccessorW input_grad_accessor_gpu =
           gpu_allocator.allocate_tensor(input_shape);
+      fill_with_zeros(input_grad_accessor_gpu);

       Kernels::Reverse::backward_kernel(
           managed_stream.raw_stream(),
@@ -218,25 +218,22 @@ TEST_SUITE(FF_TEST_SUITE) {
           load_accessor_data<DataType::FLOAT>(input_grad_accessor_gpu);

       // Run CPU Cast Backward Kernel
-      GenericTensorAccessorW output_grad_accessor_cpu =
-          copy_tensor_between_memories<DataType::FLOAT>(
-              read_only_accessor_from_write_accessor(output_grad_accessor_gpu),
-              cpu_allocator);
+      GenericTensorAccessorR output_grad_accessor_cpu =
+          copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
       GenericTensorAccessorW input_grad_accessor_cpu =
           cpu_allocator.allocate_tensor(input_shape);
+      fill_with_zeros(input_grad_accessor_cpu);

-      Kernels::Reverse::cpu_backward_kernel(
-          output_grad_accessor_cpu.get_float_ptr(),
-          input_grad_accessor_cpu.get_float_ptr(),
-          num_out_blks,
-          reverse_dim_size,
-          in_blk_size,
-          input_grad_accessor_cpu.shape.num_elements());
+      Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu,
+                                            input_grad_accessor_cpu,
+                                            num_out_blks,
+                                            reverse_dim_size,
+                                            in_blk_size);

       std::vector<float> result_data_cpu =
           load_accessor_data<DataType::FLOAT>(input_grad_accessor_cpu);

-      CHECK(result_data_gpu == result_data_cpu);
+      CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu));
     }
   }
 }
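Note: the test rewrites above all follow one pattern: run the CUDA kernel on device-resident tensors, copy the same inputs to host memory, run the CPU reference kernel, and compare the two outputs with a tolerance. Sketched below in terms of the test_utils API in this patch; run_gpu and run_cpu are hypothetical stand-ins for the actual kernel invocations:

    GenericTensorAccessorR in_gpu =
        create_random_filled_accessor_r<DataType::FLOAT>(shape, gpu_allocator);
    GenericTensorAccessorW out_gpu = gpu_allocator.allocate_tensor(out_shape);
    fill_with_zeros(out_gpu);
    run_gpu(in_gpu, out_gpu); // CUDA kernel under test

    GenericTensorAccessorR in_cpu = copy_tensor_accessor_r(in_gpu, cpu_allocator);
    GenericTensorAccessorW out_cpu = cpu_allocator.allocate_tensor(out_shape);
    fill_with_zeros(out_cpu);
    run_cpu(in_cpu, out_cpu); // CPU reference kernel

    CHECK(vectors_are_approx_equal(
        load_accessor_data<DataType::FLOAT>(out_gpu),
        load_accessor_data<DataType::FLOAT>(out_cpu)));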
diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h
index d4511c9dc5..a9d522b948 100644
--- a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_KERNELS_TEST_UTILS
 #define _FLEXFLOW_KERNELS_TEST_UTILS

+#include "kernels/datatype_dispatch.h"
 #include "kernels/device.h"
 #include "kernels/local_cpu_allocator.h"
 #include "kernels/local_cuda_allocator.h"
@@ -17,18 +18,16 @@ using namespace FlexFlow;
 template <typename DT>
 void transfer_memory(GenericTensorAccessorW dst_accessor,
                      const DT *src,
-                     AllocLocation src_loc) {
+                     DeviceType src_device_type) {
   size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT);

-  AllocLocation dst_loc =
-      dst_accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST;
+  DeviceType dst_device_type = dst_accessor.device_type;
+
-  if (src_loc == AllocLocation::HOST && dst_loc == AllocLocation::HOST) {
+  if (device_on_cpu(src_device_type) && device_on_cpu(dst_device_type)) {
     memcpy(dst_accessor.ptr, src, bytes);
-  } else if (src_loc == AllocLocation::HOST &&
-             dst_loc == AllocLocation::DEVICE) {
+  } else if (device_on_cpu(src_device_type) && device_on_gpu(dst_device_type)) {
     checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice));
-  } else if (src_loc == AllocLocation::DEVICE &&
-             dst_loc == AllocLocation::HOST) {
+  } else if (device_on_gpu(src_device_type) && device_on_cpu(dst_device_type)) {
     checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost));
   } else {
     checkCUDA(
@@ -41,11 +40,10 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
                                                        Allocator &allocator) {
   assert(shape.data_type == DataType::FLOAT ||
          shape.data_type == DataType::DOUBLE);
-  using T = real_type<DT>;
+
+  using T = real_type_t<DT>;

   GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  accessor.on_device =
-      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;

   std::vector<T> host_data(accessor.shape.num_elements());
   std::random_device rd;
@@ -56,7 +54,7 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
     val = dist(gen);
   }

-  transfer_memory(accessor, host_data.data(), AllocLocation::HOST);
+  transfer_memory(accessor, host_data.data(), DeviceType::CPU);

   return accessor;
 }
@@ -64,103 +62,64 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
 template <DataType DT>
 GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
                                                        Allocator &allocator) {
+  using T = real_type_t<DT>;
   GenericTensorAccessorW accessor =
       create_random_filled_accessor_w<DT>(shape, allocator);

   return read_only_accessor_from_write_accessor(accessor);
 }

-template <typename DT>
+template <typename T>
 GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
                                                 Allocator &allocator,
-                                                DT val) {
-  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  accessor.on_device =
-      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;
-
-  size_t volume = accessor.shape.get_volume();
-  std::vector<DT> host_data(volume, val);
-
-  transfer_memory(accessor, host_data.data(), AllocLocation::HOST);
-
-  return accessor;
-}
-
-template <typename DT, typename F>
-GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape,
-                                                     Allocator &allocator,
-                                                     F transform) {
+                                                T val) {
   GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
-  accessor.on_device =
-      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;

   size_t volume = accessor.shape.get_volume();
-  std::vector<DT> input_data(volume);
-  std::vector<DT> output_data(volume);
-
-  std::transform(
-      input_data.begin(), input_data.end(), output_data.begin(), transform);
+  std::vector<T> host_data(volume, val);

-  transfer_memory(accessor, output_data.data(), AllocLocation::HOST);
+  transfer_memory(accessor, host_data.data(), DeviceType::CPU);

   return accessor;
 }

 template <DataType DT>
-GenericTensorAccessorW
-    copy_tensor_between_memories(GenericTensorAccessorR accessor,
-                                 Allocator &allocator) {
-  TensorShape shape = get_tensor_shape(accessor.shape, accessor.data_type);
-  GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape);
-  copied_accessor.on_device =
-      (allocator.alloc_location == AllocLocation::DEVICE) ? true : false;
-
-  AllocLocation src_loc =
-      accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST;
-
-  transfer_memory(copied_accessor, accessor.get<DT>(), src_loc);
-
-  return copied_accessor;
-}
-
-TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims,
-                                               DataType DT);
-
-template <DataType DT>
-std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorR accessor) {
-  using T = real_type<DT>;
+std::vector<real_type_t<DT>>
+    load_accessor_data(GenericTensorAccessorR accessor) {
+  using T = real_type_t<DT>;

   int volume = accessor.shape.get_volume();
   std::vector<T> local_data(volume);
   T const *src_ptr = accessor.get<DT>();

-  if (accessor.on_device) {
+  if (device_on_cpu(accessor.device_type)) {
+    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
+  } else {
     checkCUDA(cudaMemcpy(local_data.data(),
                          src_ptr,
                          volume * sizeof(T),
                          cudaMemcpyDeviceToHost));
-  } else {
-    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
   }

   return local_data;
 }

 template <DataType DT>
-std::vector<real_type<DT>> load_accessor_data(GenericTensorAccessorW accessor) {
-  using T = real_type<DT>;
+std::vector<real_type_t<DT>>
+    load_accessor_data(GenericTensorAccessorW accessor) {
+  using T = real_type_t<DT>;

   int volume = accessor.shape.get_volume();
   std::vector<T> local_data(volume);
   T const *src_ptr = accessor.get<DT>();

-  if (accessor.on_device) {
+  if (device_on_cpu(accessor.device_type)) {
+    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
+  } else {
     checkCUDA(cudaMemcpy(local_data.data(),
                          src_ptr,
                          volume * sizeof(T),
                          cudaMemcpyDeviceToHost));
-  } else {
-    memcpy(local_data.data(), src_ptr, volume * sizeof(T));
   }

   return local_data;

diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/local-execution/include/local-execution/local_cpu_allocator.h
index d1e81facf2..cf6cfe35d1 100644
--- a/lib/local-execution/include/local-execution/local_cpu_allocator.h
+++ b/lib/local-execution/include/local-execution/local_cpu_allocator.h
@@ -12,6 +12,8 @@ struct LocalCPUAllocator : public IAllocator {
   void *allocate(size_t) override;
   void deallocate(void *) override;

+  DeviceType get_allocation_device_type() const override;
+
 private:
   std::unordered_map<void *, std::unique_ptr<void, decltype(&free)>> ptrs;
 };

diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h
index d6f338fe14..f697337c52 100644
--- a/lib/local-execution/include/local-execution/tracked_allocator.h
+++ b/lib/local-execution/include/local-execution/tracked_allocator.h
@@ -12,8 +12,10 @@ struct TrackedAllocator : public IAllocator {
   ~TrackedAllocator() = default;

   void *allocate(size_t) override;
-  void *allocate_and_zero(size_t) override;
   void deallocate(void *) override;
+
+  DeviceType get_allocation_device_type() const override;
+
   size_t get_current_mem_usage();

 private:

diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/local-execution/src/local_cpu_allocator.cc
index 4ca5f987a8..c4657e26b5 100644
--- a/lib/local-execution/src/local_cpu_allocator.cc
+++ b/lib/local-execution/src/local_cpu_allocator.cc
@@ -17,6 +17,10 @@ void LocalCPUAllocator::deallocate(void *ptr) {
   }
 }

+DeviceType LocalCPUAllocator::get_allocation_device_type() const {
+  return DeviceType::CPU;
+}
+
 Allocator create_local_cpu_memory_allocator() {
   return Allocator::create<LocalCPUAllocator>();
 }
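Note: with get_allocation_device_type() available on every allocator, code that builds accessors (like the task argument accessor below) can record where the backing memory lives instead of threading a separate on_device flag around. A small sketch of the idea; Buffer is illustrative, while Allocator and DeviceType are as in this patch:

    // Tag data with the device type of the allocator that produced it, so a
    // later transfer can choose the right cudaMemcpyKind.
    struct Buffer {
      void *ptr;
      DeviceType device_type;
    };

    Buffer make_buffer(Allocator &a, size_t n) {
      return Buffer{a.allocate(n), a.get_allocation_device_type()};
    }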
this->allocator.get_allocation_device_type()}); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 9f13f006f3..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,12 +12,6 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } -void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr = this->allocator.allocate_and_zero(requested_memory_size); - this->current_mem_usage += requested_memory_size; - return ptr; -} - void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); @@ -29,9 +23,12 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { Allocator allocator = Allocator::create(base_allocator); - allocator.alloc_location = base_allocator.alloc_location; return allocator; } From 0304f17e77563c35ca9daa6c3c6bcd9a4a5bb2a1 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:26:05 -0700 Subject: [PATCH 08/42] accessor.h formatting --- lib/kernels/include/kernels/accessor.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 846115060f..e63e77d0ad 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -31,7 +31,10 @@ class GenericTensorAccessorW { GenericTensorAccessorW() = delete; - GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, void *ptr, DeviceType device_type); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); bool operator==(GenericTensorAccessorW const &) const; bool operator!=(GenericTensorAccessorW const &) const; From 7c3ff87421165a22d8e20dee5eaafb4bd3aa51f5 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:47:55 -0700 Subject: [PATCH 09/42] mk_runtime_error formatting --- lib/kernels/include/kernels/accessor.h | 10 +++++----- lib/kernels/src/accessor.cc | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index e63e77d0ad..e29f73924c 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -45,8 +45,8 @@ class GenericTensorAccessorW { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t
<DT>; @@ -63,8 +63,8 @@ class GenericTensorAccessorW { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t<DT>
; @@ -130,7 +130,7 @@ class GenericTensorAccessorR { } if (this->data_type != DT) { throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + fmt::format("Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t<DT>
; diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index c0b11a2299..a2b3e94d33 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -21,10 +21,10 @@ size_t GenericTensorAccessorW::calculate_index_offset( std::initializer_list const &indices) const { if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error( + throw mk_runtime_error(fmt::format( "Number of indices ({}) does not match the number of dimensions ({}).", indices.size(), - this->shape.num_dims()); + this->shape.num_dims())); } size_t offset = 0; @@ -36,11 +36,11 @@ size_t GenericTensorAccessorW::calculate_index_offset( cur_idx = *it--; if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error("In {} dimension, attempting to access index {} " + throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, cur_idx, - this->shape[legion_dim_t(i)]); + this->shape[legion_dim_t(i)])); } offset += cur_idx * multiplier; @@ -110,10 +110,10 @@ size_t GenericTensorAccessorR::calculate_index_offset( std::initializer_list const &indices) const { if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error( + throw mk_runtime_error(fmt::format( "Number of indices ({}) does not match the number of dimensions ({}).", indices.size(), - this->shape.num_dims()); + this->shape.num_dims())); } size_t offset = 0; @@ -125,11 +125,11 @@ size_t GenericTensorAccessorR::calculate_index_offset( cur_idx = *it--; if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error("In {} dimension, attempting to access index {} " + throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, cur_idx, - this->shape[legion_dim_t(i)]); + this->shape[legion_dim_t(i)])); } offset += cur_idx * multiplier; From 65d78049c2d2cb933e5cf2be9545ee00693a9b97 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 01:08:59 -0700 Subject: [PATCH 10/42] reverse_kernels include --- lib/kernels/src/cpu/reverse_kernels.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index 1971435d8c..afa92b307c 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,4 +1,5 @@ #include "kernels/reverse_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" #include #include From 7c5fb1fa853fb91f0245a84910c0aa86a2f89db4 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:40:12 -0700 Subject: [PATCH 11/42] test_utils refactor and clarity --- lib/kernels/include/kernels/accessor.h | 22 ++- lib/kernels/include/kernels/allocation.h | 2 +- .../include/kernels/replicate_kernels.h | 2 +- .../include/kernels/replicate_kernels_cpu.h | 6 +- .../include/kernels/reverse_kernels_cpu.h | 14 +- lib/kernels/src/accessor.cc | 130 ++++++++++++++---- lib/kernels/src/allocation.cc | 6 +- lib/kernels/src/cpu/replicate_kernels.cc | 53 +++---- lib/kernels/src/cpu/reverse_kernels.cc | 63 +++------ lib/kernels/src/cuda/ops/replicate_kernels.cu | 6 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 13 +- lib/kernels/test/src/test_attention_kernel.cc | 26 ++-- .../test/src/test_batch_matmul_kernel.cc | 12 +- .../test/src/test_batch_norm_kernel.cc | 15 +- lib/kernels/test/src/test_cast_kernel.cc | 49 ------- lib/kernels/test/src/test_dropout.cc | 6 +- lib/kernels/test/src/test_gather_kernels.cc | 3 +- .../test/src/test_layer_norm_kernels.cc | 3 +- 
lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_replicate_kernel.cc | 80 ----------- lib/kernels/test/src/test_reverse_kernels.cc | 102 +------------- lib/kernels/test/src/test_softmax_kernel.cc | 6 +- lib/kernels/test/src/test_split_kernel.cc | 6 +- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- lib/kernels/test/src/test_utils.cc | 77 +++++++++++ lib/kernels/test/src/test_utils.h | 130 ++++-------------- .../src/local_task_argument_accessor.cc | 12 +- lib/local-execution/src/ops/replicate.cc | 2 +- 28 files changed, 333 insertions(+), 522 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index e29f73924c..0a134db695 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -11,6 +11,8 @@ namespace FlexFlow { +struct Allocator; + class GenericTensorAccessorW { public: template @@ -129,8 +131,8 @@ class GenericTensorAccessorR { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", this->data_type, DT)); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t
; @@ -255,6 +257,22 @@ std::pair std::pair get_shape_and_datatype(GenericTensorAccessorW const &accessor); +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); + +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorW const &src_accessor); + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 893be513ea..4bf97118ce 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 877eeabf04..7ed55cd1a1 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -11,8 +11,8 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h index a72b799875..1c7aa4ee4a 100644 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -7,10 +7,10 @@ namespace FlexFlow::Kernels::Replicate { void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorW &output); -void cpu_backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas); } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index b0edaa264c..35af06aafb 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -1,22 +1,16 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H -#include "accessor.h" -#include "device.h" +#include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow::Kernels::Reverse { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size); + GenericTensorAccessorW &output_accessor); void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, - GenericTensorAccessorW &input_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size); + GenericTensorAccessorW &input_accessor); } // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index a2b3e94d33..9332dd6703 100644 --- 
a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,7 +1,45 @@ #include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" namespace FlexFlow { +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = dst_accessor.shape.get_volume() * + size_of_datatype(dst_accessor.data_type); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorW const &src_accessor) { + GenericTensorAccessorR r_src_accessor = + read_only_accessor_from_write_accessor(src_accessor); + transfer_data_between_accessors(dst_accessor, r_src_accessor); +} + GenericTensorAccessorW::GenericTensorAccessorW( DataType data_type, ArrayShape const &shape, @@ -30,21 +68,22 @@ size_t GenericTensorAccessorW::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; size_t cur_idx; - auto it = indices.end() - 1; - - for (std::size_t i = this->shape.num_dims(); i-- > 0;) { - cur_idx = *it--; - - if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - cur_idx, - this->shape[legion_dim_t(i)])); + auto it = indices.begin(); + + for (size_t i = 0; i < this->shape.num_dims(); i++) { + cur_idx = *it++; + + if (cur_idx >= this->shape.at(legion_dim_t(i))) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + cur_idx, + this->shape.at(legion_dim_t(i)))); } offset += cur_idx * multiplier; - multiplier *= this->shape[legion_dim_t(i)]; + multiplier *= this->shape.at(legion_dim_t(i)); } return offset; @@ -119,21 +158,22 @@ size_t GenericTensorAccessorR::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; size_t cur_idx; - auto it = indices.end() - 1; - - for (std::size_t i = this->shape.num_dims(); i-- > 0;) { - cur_idx = *it--; - - if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - cur_idx, - this->shape[legion_dim_t(i)])); + auto it = indices.begin(); + + for (size_t i = 0; i < this->shape.num_dims(); i++) { + cur_idx = *it++; + + if (cur_idx >= this->shape.at(legion_dim_t(i))) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + cur_idx, + this->shape.at(legion_dim_t(i)))); } offset += cur_idx * multiplier; - multiplier *= this->shape[legion_dim_t(i)]; + multiplier *= this->shape.at(legion_dim_t(i)); } return offset; @@ -307,4 +347,46 @@ std::pair return std::make_pair(accessor.shape, 
accessor.data_type); } +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + transfer_data_between_accessors(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, std::ref(allocator)); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + transfer_data_between_accessors(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, std::ref(allocator)); +} + } // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 751cdc0ebb..733146851a 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -18,8 +18,10 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return { - tensor_shape.data_type, tensor_shape, ptr, get_allocation_device_type()}; + return {tensor_shape.data_type, + tensor_shape, + ptr, + this->get_allocation_device_type()}; } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 5853869047..683739b91e 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -3,52 +3,43 @@ namespace FlexFlow::Kernels::Replicate { -template -void cpu_replicate_backward_kernel(T *input, - T const *output, - size_t num_elements, - size_t num_replicas) { - for (size_t i = 0; i < num_elements; i++) { - T sum = 0; - for (size_t j = 0; j < num_replicas; j++) { - sum += output[i + j * num_elements]; - } - input[i] = sum; - } -} - -template +template struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - memcpy(output.get(), - input.get(), - input.shape.num_elements() * size_of_datatype(T)); + GenericTensorAccessorW &output) { + memcpy(output.get
<DT>(), + input.get<DT>
(), + input.shape.num_elements() * size_of_datatype(DT)); } }; -template <DataType T> +template <DataType DT> struct CPUBackwardKernel { - void operator()(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas) { - cpu_replicate_backward_kernel(input.get<T>(), - output.get<T>(), - input.shape.num_elements(), - num_replicas); + using T = real_type_t<DT>
; + for (size_t i = 0; i < input.shape.num_elements(); i++) { + T cur_sum = 0; + for (size_t j = 0; j < num_replicas; j++) { + cur_sum += output.at<DT>
(i, j); + } + input.at<DT>
(i) = cur_sum; + } } }; void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); + GenericTensorAccessorW &output) { + DataTypeDispatch1{}( + input.data_type, input, std::ref(output)); } -void cpu_backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, input, output, num_replicas); + input.data_type, output, std::ref(input), num_replicas); } } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index afa92b307c..bc114c4e60 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/reverse_kernels_cpu.h" #include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" #include #include @@ -8,31 +8,20 @@ namespace FlexFlow::Kernels::Reverse { template struct CPUReverseForwardKernel { void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { + GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - // For each output block, copy the input block - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { - for (coord_t i = 0; i < in_blk_size; ++i) { - output.at
<DT>(blk_idx, rev_idx, i) = - input.at<DT>
(blk_idx, rev_idx, i); - } - } - } - - // Reverse the blocks within each output block - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size / 2; ++rev_idx) { - coord_t start_idx = rev_idx; - coord_t end_idx = reverse_dim_size - 1 - rev_idx; - - for (coord_t i = 0; i < in_blk_size; ++i) { - std::swap(output.at<DT>
(blk_idx, start_idx, i), - output.at<DT>
(blk_idx, end_idx, i)); + coord_t num_out_blocks = input.shape.at(legion_dim_t(0)); + coord_t reverse_dim_size = input.shape.at(legion_dim_t(1)); + coord_t in_block_size = input.shape.at(legion_dim_t(2)); + + for (coord_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { + for (coord_t i = 0; i < in_block_size; i++) { + output.at<DT>
(block_idx, rev_idx, i) = + input.at<DT>
(num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i); } } } @@ -40,29 +29,15 @@ struct CPUReverseForwardKernel { }; void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { - DataTypeDispatch1{}(input_accessor.data_type, - input_accessor, - std::ref(output_accessor), - num_out_blks, - reverse_dim_size, - in_blk_size); + GenericTensorAccessorW &output_accessor) { + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, std::ref(output_accessor)); } void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, - GenericTensorAccessorW &input_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { - DataTypeDispatch1{}(output_accessor.data_type, - output_accessor, - std::ref(input_accessor), - num_out_blks, - reverse_dim_size, - in_blk_size); + GenericTensorAccessorW &input_accessor) { + DataTypeDispatch1{}( + output_accessor.data_type, output_accessor, std::ref(input_accessor)); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 76bfbe2658..1aa61375f0 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -50,8 +50,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; replicate_backward_kernel> @@ -70,11 +70,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index f73c57dedf..8e93fec0d6 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -27,6 +27,7 @@ namespace Reverse { // coord_t reverse_dim_size, // coord_t in_blk_size) { // CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t out_idx = i; // coord_t blk_idx = i / (reverse_dim_size * in_blk_size); // i = i - blk_idx * (reverse_dim_size * in_blk_size); // coord_t reverse_dim_idx = i / in_blk_size; @@ -34,8 +35,18 @@ namespace Reverse { // coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + // (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + // i; -// out_ptr[i] = in_ptr[in_idx]; +// out_ptr[out_idx] = in_ptr[in_idx]; // } +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = in_ptr[in_idx]; +// } // } /* I mentioned this earlier, but I still think the reverse_forward_kernel code diff --git 
a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index c4a3f7bd50..023233ecb0 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -47,16 +47,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -76,20 +73,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, - allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index bb9c4c07bd..8a11a069f5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -29,14 +29,11 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, - allocator); + create_random_filled_accessor_w(input_shape_a, allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, - allocator); + create_random_filled_accessor_w(input_shape_b, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -55,8 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 43bcc5528a..03a3a1ad40 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ 
b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -36,8 +36,7 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w( @@ -59,17 +58,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, - allocator); + create_random_filled_accessor_w(scale_shape, allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, - allocator); + create_random_filled_accessor_w(bias_shape, allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index e9674cd167..1be5839a9c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -75,53 +75,4 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } - - TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); - - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - - // Only calling forward kernel as backward kernel is exactly the same - SUBCASE("forward_kernel") { - // Run GPU Forward Kernel - GenericTensorAccessorW input_accessor_gpu = - create_random_filled_accessor_w(input_shape, - gpu_allocator); - Kernels::Cast::forward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(input_accessor_gpu), - output_accessor_gpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - create_random_filled_accessor_w(input_shape, - cpu_allocator); - Kernels::Cast::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 7ff364bada..4be2bdf7bb 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -47,11 +47,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - 
create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 4f05c89813..7f97563217 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -41,8 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 3ac0e1425f..7d7298f83d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -56,8 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index f71d9cfa11..00fa968235 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -45,11 +45,9 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 8af741b3a7..27223cc7b5 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -110,84 +110,4 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } - - TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { - std::size_t num_replicas = 2; - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); - - ManagedPerDeviceFFHandle managed_handle{}; - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - // Run GPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - 
fill_with_zeros(output_accessor_gpu); - - Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); - - Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); - - Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor_gpu, - output_grad_accessor_gpu, - num_replicas); - - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - - // Run CPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); - - Kernels::Replicate::cpu_backward_kernel( - input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index b1f90a0a7e..4adf79847a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -42,8 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -137,103 +136,4 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } - - TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 2; - std::size_t reverse_dim_size = 3; - std::size_t in_blk_size = 5; - - TensorShape input_shape = make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); - TensorShape output_shape = input_shape; - - ManagedPerDeviceFFHandle managed_handle{}; - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - auto transform = [counter = 0.0f](float val) mutable { - return counter++; - }; - - // Run GPU Cast Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_gpu); - - 
Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu.get_float_ptr(), - output_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_gpu.shape.num_elements()); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Cast Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); - - Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Cast Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); - - Kernels::Reverse::backward_kernel( - managed_stream.raw_stream(), - output_grad_accessor_gpu.get_float_ptr(), - input_grad_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_gpu.shape.num_elements()); - - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - - // Run CPU Cast Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); - - Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, - input_grad_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 88f24a1a08..5519c30b80 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -23,13 +23,11 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 9f1d390501..34993fa151 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -27,8 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -49,8 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector 
output_grad_ptrs(num_outputs); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index c8baaac54f..0bc85cb8e0 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -39,8 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index bfed1241ba..ca9e9e9c11 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -108,6 +108,83 @@ struct CPUAccessorRContainsNonZero { } }; +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +bool contains_non_zero(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + return contains_non_zero(r_accessor); +} + +GenericTensorAccessorR + create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} + +GenericTensorAccessorW + create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); + } + return cpu_accessor; +} + +template +struct PrintCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>; + + T const *data_ptr = accessor.get<DT>
(); + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + std::cout << data_ptr[i] << " "; + } + std::cout << "\n"; + } +}; + +void print_accessor(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + DataTypeDispatch1<PrintCPUAccessorR>{}(cpu_accessor.data_type, cpu_accessor); +} + +void print_accessor(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + print_accessor(r_accessor); +} + +template <DataType DT> +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t<DT>
; + + T const *data_ptr = accessor.get<DT>
(); + + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index a9d522b948..19599d2900 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -1,135 +1,59 @@ #ifndef _FLEXFLOW_KERNELS_TEST_UTILS #define _FLEXFLOW_KERNELS_TEST_UTILS +#include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" #include -#include #include #include #include using namespace FlexFlow; -template -void transfer_memory(GenericTensorAccessorW dst_accessor, - const DT *src, - DeviceType src_device_type) { - size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT); - - DeviceType dst_device_type = dst_accessor.device_type; - - if (device_on_cpu(src_device_type) && device_on_cpu(dst_device_type)) { - memcpy(dst_accessor.ptr, src, bytes); - } else if (device_on_cpu(src_device_type) && device_on_gpu(dst_device_type)) { - checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice)); - } else if (device_on_gpu(src_device_type) && device_on_cpu(dst_device_type)) { - checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost)); - } else { - checkCUDA( - cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToDevice)); - } -} - -template GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator) { - assert(shape.data_type == DataType::FLOAT || - shape.data_type == DataType::DOUBLE); - - using T = real_type_t
<DT>; - - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - - std::vector<T> host_data(accessor.shape.num_elements()); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<T> dist(-1.0, 1.0); + Allocator &allocator); - for (auto &val : host_data) { - val = dist(gen); - } - - transfer_memory(accessor, host_data.data(), DeviceType::CPU); - - return accessor; -} - -template <DataType DT> GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, - Allocator &allocator) { - using T = real_type_t<DT>
; - GenericTensorAccessorW accessor = - create_random_filled_accessor_w<DT>
(shape, allocator); + Allocator &allocator); - return read_only_accessor_from_write_accessor(accessor); -} +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); -template -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - T val) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered const &dims, + DataType DT); - size_t volume = accessor.shape.get_volume(); - std::vector host_data(volume, val); +bool contains_non_zero(GenericTensorAccessorR const &accessor); - transfer_memory(accessor, host_data.data(), DeviceType::CPU); +void fill_with_zeros(GenericTensorAccessorW const &accessor); - return accessor; -} +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); -template -std::vector> - load_accessor_data(GenericTensorAccessorR accessor) { - using T = real_type_t
<DT>; - - int volume = accessor.shape.get_volume(); - std::vector<T> local_data(volume); - T const *src_ptr = accessor.get<DT>
(); - - if (device_on_cpu(accessor.device_type)) { - memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } else { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); - } +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &allocator); - return local_data; -} +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor); -template -std::vector> - load_accessor_data(GenericTensorAccessorW accessor) { - using T = real_type_t
<DT>; - - int volume = accessor.shape.get_volume(); - std::vector<T> local_data(volume); - T const *src_ptr = accessor.get<DT>
(); - - if (device_on_cpu(accessor.device_type)) { - memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } else { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); - } +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); - return local_data; -} +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), data.end(), [](T const &val) { return val == 0; }); -} +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); template std::vector repeat(std::size_t n, Func &&func) { diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index f61ed7bc7b..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,11 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, - tensor_backing.shape, - tensor_backing.ptr, - this->allocator.get_allocation_device_type()}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -47,10 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, - tensor_backing.shape, - tensor_backing.ptr, - this->allocator.get_allocation_device_type()}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 135475a711..56bbfdd371 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -67,8 +67,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree); } From 8188afe1e8e0149bb9685dcd15c65bdc0a23a27c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:41:17 -0700 Subject: [PATCH 12/42] formatting --- .envrc | 3 +++ .vimrc | 8 ++++++++ 2 files changed, 11 insertions(+) create mode 100644 .envrc create mode 100644 .vimrc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..2797f0f929 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source_up_if_exists + +use flake diff --git a/.vimrc b/.vimrc new file mode 100644 index 0000000000..4c8a8a8279 --- /dev/null +++ b/.vimrc @@ -0,0 +1,8 @@ +" example search path configuration +set path=lib/runtime/**,lib/** + +" set build target +" let g:target = "pcg" + +" set test target +" let g:test_target = "utils-test" From a13255bacacb463fefdbc0d27a775d5828668a8e Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:55:20 -0700 Subject: [PATCH 13/42] comment removal reverse_kernels --- 
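A quick sketch of the index arithmetic in question (host-side C++, assuming
coord_t is a signed integer type; this is illustrative, not code from the
tree). Both the modulo-based variant removed below and the restored
index-mutating variant recover the same source element for a flat position p;
they differ only in the destination index at the final write, which is what
the deleted comment was questioning:

    #include <cstdint>
    using coord_t = int64_t; // assumption for this sketch

    struct ReverseIdx {
      coord_t src;         // input element both variants read
      coord_t dst_saved;   // write target if the loop index is saved first
      coord_t dst_mutated; // write target after i is reduced in place
    };

    ReverseIdx reverse_indices(coord_t p,
                               coord_t reverse_dim_size,
                               coord_t in_blk_size) {
      coord_t blk_idx = p / (reverse_dim_size * in_blk_size);
      coord_t within_blk = p % (reverse_dim_size * in_blk_size);
      coord_t reverse_dim_idx = within_blk / in_blk_size;
      coord_t in_idx_in_blk = within_blk % in_blk_size;
      coord_t src = blk_idx * (reverse_dim_size * in_blk_size) +
                    (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size +
                    in_idx_in_blk;
      return ReverseIdx{src, /*dst_saved=*/p, /*dst_mutated=*/in_idx_in_blk};
    }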
lib/kernels/src/cuda/ops/reverse_kernels.cu | 48 +++------------------ 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8e93fec0d6..2c25293c36 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -17,44 +17,9 @@ #include "kernels/reverse_kernels.h" namespace FlexFlow { - namespace Kernels { namespace Reverse { -// __global__ void reverse_forward_kernel(float const *in_ptr, -// float *out_ptr, -// coord_t num_out_blks, -// coord_t reverse_dim_size, -// coord_t in_blk_size) { -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t out_idx = i; -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[out_idx] = in_ptr[in_idx]; -// } -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[i] = in_ptr[in_idx]; -// } -// } - -/* I mentioned this earlier, but I still think the reverse_forward_kernel code - is incorrect, even though it matches the code in inference/master? Whenever - I'm testing the code and printing out the output, I'm getting unexpected - outputs, and I think it's a result of modifying the loop index i in the - previous code? 
-*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, @@ -62,13 +27,12 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = idx_within_blk / in_blk_size; - coord_t in_idx = idx_within_blk % in_blk_size; - coord_t input_index = - blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; - out_ptr[i] = in_ptr[input_index]; + i = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = i / in_blk_size; + i = i - reverse_dim_idx * in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; + out_ptr[i] = in_ptr[in_idx]; } } From 7ed56247a30b41f1791c66fc8a4544507a383103 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 19:22:55 -0700 Subject: [PATCH 14/42] Issue #1435, tests for managed stream and handle --- lib/kernels/src/managed_ff_stream.cc | 19 +++++++---- .../src/managed_per_device_ff_handle.cc | 33 +++++++++++------- .../test/src/test_managed_ff_stream.cc | 29 ++++++++++++++++ .../src/test_managed_per_device_ff_handle.cc | 34 +++++++++++++++++++ 4 files changed, 97 insertions(+), 18 deletions(-) create mode 100644 lib/kernels/test/src/test_managed_ff_stream.cc create mode 100644 lib/kernels/test/src/test_managed_per_device_ff_handle.cc diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..a8b44dc1d3 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,35 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete stream; + } + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ca105f9bc9 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -4,13 +4,13 @@ namespace FlexFlow { ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; + this->handle = new PerDeviceFFHandle; + this->handle->workSpaceSize = 1024 * 1024; + this->handle->allowTensorOpMathConversion = true; - checkCUDNN(cudnnCreate(&handle->dnn)); - 
checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +19,25 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; + } + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; + this->handle = nullptr; } } diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..1dc40f0a92 --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,29 @@ +#include "doctest/doctest.h" +#include "kernels/managed_ff_stream.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test Managed FF Stream") { + ManagedFFStream base_stream{}; + + SUBCASE("Test ManagedFFStream Move Constructor") { + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); + + ManagedFFStream new_stream(std::move(base_stream)); + + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + + SUBCASE("Test ManagedFFStream Assignment Operator") { + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); + + ManagedFFStream new_stream{}; + new_stream = std::move(base_stream); + + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..d99d375a7c --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,34 @@ +#include "doctest/doctest.h" +#include "kernels/managed_per_device_ff_handle.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test Managed Per Device FF Handle") { + ManagedPerDeviceFFHandle base_handle{}; + + SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + ManagedPerDeviceFFHandle 
new_handle{}; + new_handle = std::move(base_handle); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + } +} From c1758c08a43b72f062faa599468c9d743a6cf318 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 19:25:18 -0700 Subject: [PATCH 15/42] #1435 formatting --- lib/kernels/test/src/test_managed_ff_stream.cc | 6 +++--- lib/kernels/test/src/test_managed_per_device_ff_handle.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 1dc40f0a92..1dedb0c41d 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -5,11 +5,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed FF Stream") { - ManagedFFStream base_stream{}; + ManagedFFStream base_stream{}; SUBCASE("Test ManagedFFStream Move Constructor") { ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - + ManagedFFStream new_stream(std::move(base_stream)); CHECK(&base_stream.raw_stream() == nullptr); @@ -21,7 +21,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream new_stream{}; new_stream = std::move(base_stream); - + CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d99d375a7c..e85cfd61c7 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed Per Device FF Handle") { - ManagedPerDeviceFFHandle base_handle{}; + ManagedPerDeviceFFHandle base_handle{}; SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); @@ -26,7 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedPerDeviceFFHandle new_handle{}; new_handle = std::move(base_handle); - + CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } From 54b3888eb36776eb3d99901463777c4d592ee064 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 20:24:27 -0700 Subject: [PATCH 16/42] #1409 issue, change datatype for linear kernels away from void * --- lib/kernels/include/kernels/linear_kernels.h | 22 +++--- lib/kernels/src/cuda/ops/linear_kernels.cu | 76 +++++++++++--------- lib/local-execution/src/ops/linear.cc | 14 ++-- 3 files changed, 59 insertions(+), 53 deletions(-) diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 99549adece..cff6563629 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -50,23 +50,23 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); 
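// A brief note on the signature change above: moving from void * to typed
// float pointers lets the compiler reject a mismatched buffer at the call
// site instead of failing at runtime, and confines the unavoidable untyped
// casts to the cuBLAS calls in linear_kernels.cu. A hypothetical call site
// (illustrative only, not part of this patch) now reads:
//
//   forward_kernel(stream,
//                  per_device_state,
//                  input.get_float_ptr(),   // float const *
//                  output.get_float_ptr(),  // float *
//                  weight.get_float_ptr(),
//                  bias.get_float_ptr(),
//                  in_dim, out_dim, batch_size);
//
// Under the old void * signature the same call would also have accepted,
// say, an int64 buffer without complaint.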
void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_ptr, int in_dim, int out_dim, int batch_size); diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..29b77fd9d9 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + (void *)weight_ptr, weight_type, in_dim, - input_ptr, + (void *)input_ptr, input_type, in_dim, &beta, - output_ptr, + (void *)output_ptr, output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + (void *)bias_ptr, weight_type, 1, - m.one_ptr, + (void *)m.one_ptr, CUDA_R_32F, 1, &alpha, - output_ptr, + (void *)output_ptr, output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + (void *)output_ptr, &beta, m.outputTensor, - output_ptr)); + (void *)output_ptr)); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + (void *)output_grad_ptr, + (void *)output_ptr, + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + (void *)output_grad_ptr, + (void *)output_ptr, + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + (void *)input_ptr, input_type, in_dim, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - kernel_grad_ptr, + (void 
*)kernel_grad_ptr, weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float *)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + (void *)m.one_ptr, CUDA_R_32F, 1, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - bias_grad_ptr, + (void *)bias_grad_ptr, weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + (void *)kernel_ptr, weight_type, in_dim, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - input_grad_ptr, + (void *)input_grad_ptr, input_type, in_dim, compute_type, diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 3e0b4672ab..4637cb388e 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -148,13 +148,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + input.get_float_ptr(), + (float *)input_grad.get_float_ptr(), + output.get_float_ptr(), + (float *)output_grad.get_float_ptr(), + weight.get_float_ptr(), + (float *)weight_grad.get_float_ptr(), + (float *)bias_ptr, in_dim, out_dim, batch_size); From 5b5c2f6e6ea5d7198a5ac693d024970380e4cf34 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 4 Nov 2024 23:12:02 -0800 Subject: [PATCH 17/42] R & W accessor changes, minimize code bloat --- lib/kernels/include/kernels/accessor.h | 154 ++++++++---------- lib/kernels/include/kernels/cast_kernels.h | 8 +- .../include/kernels/cast_kernels_cpu.h | 8 +- .../include/kernels/datatype_dispatch.h | 10 +- .../kernels/managed_per_device_ff_handle.h | 5 +- lib/kernels/src/accessor.cc | 107 +++--------- lib/kernels/src/cpu/cast_kernels.cc | 13 +- lib/kernels/src/cpu/replicate_kernels.cc | 9 +- lib/kernels/src/cpu/reverse_kernels.cc | 24 +-- lib/kernels/src/cuda/ops/cast_kernels.cu | 12 +- lib/kernels/src/cuda/ops/linear_kernels.cu | 42 ++--- .../src/managed_per_device_ff_handle.cc | 8 +- .../test/src/test_managed_ff_stream.cc | 12 +- .../src/test_managed_per_device_ff_handle.cc | 14 +- lib/kernels/test/src/test_utils.cc | 77 --------- lib/local-execution/src/ops/cast.cc | 8 +- lib/local-execution/src/ops/linear.cc | 14 +- .../test/src/test_local_cost_estimator.cc | 2 +- 18 files changed, 171 insertions(+), 356 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 0a134db695..653c8db42d 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -13,54 +13,36 @@ namespace FlexFlow { struct Allocator; -class GenericTensorAccessorW { +class GenericTensorAccessorR { public: template - typename data_type_enum_to_class
<DT>::type *get() const { + typename data_type_enum_to_class<DT>
::type const *get() const { if (this->data_type == DT) { - return static_cast<real_type_t<DT> *>(this->ptr); + return static_cast<real_type_t<DT> const *>(this->ptr); } else { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } } - int32_t *get_int32_ptr() const; - int64_t *get_int64_ptr() const; - float *get_float_ptr() const; - double *get_double_ptr() const; - half *get_half_ptr() const; + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; - GenericTensorAccessorW() = delete; + GenericTensorAccessorR() = delete; - GenericTensorAccessorW(DataType data_type, + GenericTensorAccessorR(DataType data_type, ArrayShape const &shape, - void *ptr, + void const *ptr, DeviceType device_type); - bool operator==(GenericTensorAccessorW const &) const; - bool operator!=(GenericTensorAccessorW const &) const; - - template <DataType DT, typename... Indices> - real_type_t<DT>
&at(Indices... indices) { - if (this->device_type != DeviceType::CPU) { - throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); - } - if (this->data_type != DT) { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } - - using T = real_type_t<DT>
; - - T *data_ptr = static_cast<T *>(this->ptr); - size_t offset = calculate_index_offset({static_cast<size_t>(indices)...}); - - return data_ptr[offset]; - } + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; - template <DataType DT, typename... Indices> - real_type_t<DT>
const &at(Indices... indices) const { + template <DataType DT> + real_type_t<DT>
const &at(std::vector<size_t> const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -72,7 +54,7 @@ class GenericTensorAccessorW { using T = real_type_t<DT>
; T const *data_ptr = static_cast<T const *>(this->ptr); - size_t offset = calculate_index_offset({static_cast<size_t>(indices)...}); + size_t offset = calculate_index_offset(indices); return data_ptr[offset]; } @@ -80,7 +62,7 @@ class GenericTensorAccessorW { public: DataType data_type; ArrayShape shape; - void *ptr; + void const *ptr; DeviceType device_type; private: @@ -90,43 +72,62 @@ class GenericTensorAccessorW { decltype(device_type) const &> tie() const; - size_t calculate_index_offset( - std::initializer_list<size_t> const &indices) const; + size_t calculate_index_offset(std::vector<size_t> const &indices) const; }; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); -class GenericTensorAccessorR { +class GenericTensorAccessorW { public: template <DataType DT> - typename data_type_enum_to_class<DT>
::type const *get() const { + typename data_type_enum_to_class<DT>
::type *get() const { if (this->data_type == DT) { - return static_cast<real_type_t<DT> const *>(this->ptr); + return static_cast<real_type_t<DT> *>(this->ptr); } else { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + int32_t *get_int32_ptr() const; + int64_t *get_int64_ptr() const; + float *get_float_ptr() const; + double *get_double_ptr() const; + half *get_half_ptr() const; - GenericTensorAccessorR() = delete; + GenericTensorAccessorW() = delete; - GenericTensorAccessorR(DataType data_type, + GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, - void const *ptr, + void *ptr, DeviceType device_type); - bool operator==(GenericTensorAccessorR const &) const; - bool operator!=(GenericTensorAccessorR const &) const; + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; + + template <DataType DT> + real_type_t<DT>
&at(std::vector<size_t> const &indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + + using T = real_type_t<DT>
; + + T *data_ptr = static_cast<T *>(this->ptr); + size_t offset = calculate_index_offset(indices); + + return data_ptr[offset]; + } - template <DataType DT, typename... Indices> - real_type_t<DT>
const &at(Indices... indices) const { + template <DataType DT> + real_type_t<DT>
&at(std::vector<size_t> const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -138,7 +139,7 @@ class GenericTensorAccessorR { using T = real_type_t<DT>
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset({static_cast(indices)...}); + size_t offset = calculate_index_offset(indices); return data_ptr[offset]; } @@ -146,7 +147,7 @@ class GenericTensorAccessorR { public: DataType data_type; ArrayShape shape; - void const *ptr; + void *ptr; DeviceType device_type; private: @@ -156,27 +157,11 @@ class GenericTensorAccessorR { decltype(device_type) const &> tie() const; - size_t calculate_index_offset( - std::initializer_list const &indices) const; + size_t calculate_index_offset(std::vector const &indices) const; }; -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); - -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); @@ -241,12 +226,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -254,16 +235,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); - -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorR const &src_accessor); -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorW const &src_accessor); +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index f67613cec6..21e76fed1d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -8,15 +8,11 @@ namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); } // namespace FlexFlow::Kernels::Cast diff --git 
a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 959617dcae..275476b4e6 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -7,14 +7,10 @@ namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index 0986d99791..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -34,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -42,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -55,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -69,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..f9f944c6ff 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 9332dd6703..4cb5bd83a2 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -4,7 +4,7 @@ namespace FlexFlow { -void transfer_data_between_accessors( +void copy_accessor_data_to_l_from_r( GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor) { size_t num_bytes = dst_accessor.shape.get_volume() * @@ -25,6 +25,8 @@ void transfer_data_between_accessors( checkCUDA(cudaMemcpy( dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); } else { + assert(src_device_type == DeviceType::GPU); + assert(src_device_type == DeviceType::CPU); checkCUDA(cudaMemcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes, @@ -32,12 +34,8 @@ void transfer_data_between_accessors( } } -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorW const &src_accessor) { - GenericTensorAccessorR r_src_accessor = - read_only_accessor_from_write_accessor(src_accessor); - transfer_data_between_accessors(dst_accessor, r_src_accessor); +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); } GenericTensorAccessorW::GenericTensorAccessorW( @@ -56,7 +54,7 @@ std::tuple const &indices) const { + std::vector const &indices) const { if (indices.size() != this->shape.num_dims()) { throw mk_runtime_error(fmt::format( @@ -67,22 +65,18 @@ size_t GenericTensorAccessorW::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; - size_t cur_idx; - auto it = indices.begin(); for (size_t i = 0; i < this->shape.num_dims(); i++) { - cur_idx = *it++; - - if (cur_idx >= this->shape.at(legion_dim_t(i))) { + if (indices[i] >= this->shape.at(legion_dim_t(i))) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, - cur_idx, + indices[i], this->shape.at(legion_dim_t(i)))); } - offset += cur_idx * multiplier; + offset += indices[i] 
* multiplier; multiplier *= this->shape.at(legion_dim_t(i)); } @@ -146,7 +140,7 @@ std::tuple const &indices) const { + std::vector const &indices) const { if (indices.size() != this->shape.num_dims()) { throw mk_runtime_error(fmt::format( @@ -155,24 +149,20 @@ size_t GenericTensorAccessorR::calculate_index_offset( this->shape.num_dims())); } - size_t offset = 0; + ssize_t offset = 0; size_t multiplier = 1; - size_t cur_idx; - auto it = indices.begin(); for (size_t i = 0; i < this->shape.num_dims(); i++) { - cur_idx = *it++; - - if (cur_idx >= this->shape.at(legion_dim_t(i))) { + if (indices[i] >= this->shape.at(legion_dim_t(i))) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, - cur_idx, + indices[i], this->shape.at(legion_dim_t(i)))); } - offset += cur_idx * multiplier; + offset += indices[i] * multiplier; multiplier *= this->shape.at(legion_dim_t(i)); } @@ -220,51 +210,6 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { return (s << fmt::to_string(a)); } -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { return get(a); } @@ -318,18 +263,11 @@ GenericTensorAccessorR read_only_accessor_from_write_accessor( writable.device_type}; } -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -342,11 +280,6 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - template struct CopyTensorAccessorW { GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, @@ -355,7 +288,7 @@ struct CopyTensorAccessorW { get_tensor_shape(src_accessor.shape, src_accessor.data_type); GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return dst_accessor; } @@ -365,7 +298,7 @@ GenericTensorAccessorW copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - 
src_accessor.data_type, src_accessor, std::ref(allocator)); + src_accessor.data_type, src_accessor, allocator); } template @@ -376,7 +309,7 @@ struct CopyTensorAccessorR { get_tensor_shape(src_accessor.shape, src_accessor.data_type); GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return read_only_accessor_from_write_accessor(dst_accessor); } @@ -386,7 +319,7 @@ GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, std::ref(allocator)); + src_accessor.data_type, src_accessor, allocator); } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 2d3f440c75..5a00503fe4 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -37,18 +37,15 @@ struct CPUBackwardKernel { }; void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, input, output); + input.data_type, output.data_type, input, output); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 683739b91e..25693b374d 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -22,24 +22,23 @@ struct CPUBackwardKernel { for (size_t i = 0; i < input.shape.num_elements(); i++) { T cur_sum = 0; for (size_t j = 0; j < num_replicas; j++) { - cur_sum += output.at
<DT>(i, j); + cur_sum += output.at<DT>
({i, j}); } - input.at<DT>
(i) = cur_sum; + input.at<DT>
({i}) = cur_sum; } } }; void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW &output) { - DataTypeDispatch1{}( - input.data_type, input, std::ref(output)); + DataTypeDispatch1{}(input.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, output, std::ref(input), num_replicas); + input.data_type, output, input, num_replicas); } } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index bc114c4e60..e5b3719d74 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,17 +11,17 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - coord_t num_out_blocks = input.shape.at(legion_dim_t(0)); - coord_t reverse_dim_size = input.shape.at(legion_dim_t(1)); - coord_t in_block_size = input.shape.at(legion_dim_t(2)); + size_t num_out_blocks = input.shape.at(legion_dim_t(0)); + size_t reverse_dim_size = input.shape.at(legion_dim_t(1)); + size_t in_block_size = input.shape.at(legion_dim_t(2)); - for (coord_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { - for (coord_t i = 0; i < in_block_size; i++) { - output.at
<DT>(block_idx, rev_idx, i) = - input.at<DT>
(num_out_blocks - 1 - block_idx, reverse_dim_size - 1 - rev_idx, in_block_size - 1 - i); + output.at<DT>
({block_idx, rev_idx, i}) = + input.at<DT>
({num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i}); } } } @@ -31,13 +31,13 @@ struct CPUReverseForwardKernel { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, GenericTensorAccessorW &output_accessor) { DataTypeDispatch1{}( - input_accessor.data_type, input_accessor, std::ref(output_accessor)); + input_accessor.data_type, input_accessor, output_accessor); } void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, GenericTensorAccessorW &input_accessor) { DataTypeDispatch1{}( - output_accessor.data_type, output_accessor, std::ref(input_accessor)); + output_accessor.data_type, output_accessor, input_accessor); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index b895ffb68f..dc342fd0e0 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -60,20 +60,16 @@ struct BackwardKernel { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 29b77fd9d9..f13ebee67e 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - (void *)weight_ptr, + reinterpret_cast(weight_ptr), weight_type, in_dim, - (void *)input_ptr, + reinterpret_cast(input_ptr), input_type, in_dim, &beta, - (void *)output_ptr, + reinterpret_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - (void *)bias_ptr, + reinterpret_cast(bias_ptr), weight_type, 1, - (void *)m.one_ptr, + reinterpret_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - (void *)output_ptr, + reinterpret_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - (void *)output_ptr, + reinterpret_cast(output_ptr), &beta, m.outputTensor, - (void *)output_ptr)); + reinterpret_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -217,14 +217,14 @@ void backward_kernel(cudaStream_t stream, if (m.activation.has_value()) { if (m.activation == Activation::RELU) { relu_backward_kernel(m.output_type, - (void *)output_grad_ptr, - (void *)output_ptr, + reinterpret_cast(output_grad_ptr), + reinterpret_cast(output_ptr), output_size, stream); } else if (m.activation == Activation::SIGMOID) { sigmoid_backward_kernel(m.output_type, - (void *)output_grad_ptr, - (void *)output_ptr, + reinterpret_cast(output_grad_ptr), + reinterpret_cast(output_ptr), output_size, 
stream); } else { @@ -241,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - (void *)input_ptr, + reinterpret_cast(input_ptr), input_type, in_dim, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)kernel_grad_ptr, + reinterpret_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -290,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - (void *)m.one_ptr, + reinterpret_cast(m.one_ptr), CUDA_R_32F, 1, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)bias_grad_ptr, + reinterpret_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -313,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - (void *)kernel_ptr, + reinterpret_cast(kernel_ptr), weight_type, in_dim, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)input_grad_ptr, + reinterpret_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index ca105f9bc9..5bd49dc26f 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -3,10 +3,11 @@ namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { this->handle = new PerDeviceFFHandle; - this->handle->workSpaceSize = 1024 * 1024; - this->handle->allowTensorOpMathConversion = true; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; checkCUDNN(cudnnCreate(&this->handle->dnn)); checkCUBLAS(cublasCreate(&this->handle->blas)); @@ -37,7 +38,6 @@ ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { checkCUBLAS(cublasDestroy(this->handle->blas)); checkCUDA(cudaFree(this->handle->workSpace)); delete this->handle; - this->handle = nullptr; } } diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 1dedb0c41d..ce8a808454 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -6,24 +6,24 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed FF Stream") { ManagedFFStream base_stream{}; + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); SUBCASE("Test ManagedFFStream Move Constructor") { - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - ManagedFFStream new_stream(std::move(base_stream)); - CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } SUBCASE("Test ManagedFFStream Assignment Operator") { - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - ManagedFFStream new_stream{}; new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } + + SUBCASE("Test Self-Assignment") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index e85cfd61c7..d39da03ba9 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ 
b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed Per Device FF Handle") { - ManagedPerDeviceFFHandle base_handle{}; + ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); @@ -13,8 +14,6 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { - PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); @@ -22,13 +21,16 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { - PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - - ManagedPerDeviceFFHandle new_handle{}; + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; new_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } + + SUBCASE("Test Self-Assignment") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index ca9e9e9c11..bfed1241ba 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -108,83 +108,6 @@ struct CPUAccessorRContainsNonZero { } }; -bool contains_non_zero(GenericTensorAccessorR const &accessor) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - return DataTypeDispatch1{}( - cpu_accessor.data_type, cpu_accessor); -} - -bool contains_non_zero(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - return contains_non_zero(r_accessor); -} - -GenericTensorAccessorR - create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorR cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); - } - return cpu_accessor; -} - -GenericTensorAccessorW - create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorW cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); - } - return cpu_accessor; -} - -template -struct PrintCPUAccessorR { - void operator()(GenericTensorAccessorR const &accessor) { - using T = real_type_t
<DT>; - - T const *data_ptr = accessor.get<DT>
(); - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - std::cout << data_ptr[i] << " "; - } - std::cout << "\n"; - } -}; - -void print_accessor(GenericTensorAccessorR const &accessor) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - DataTypeDispatch1<PrintCPUAccessorR>{}(accessor.data_type, accessor); -} - -void print_accessor(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - print_accessor(r_accessor); -} - -template <DataType DT> -struct CPUAccessorRContainsNonZero { - bool operator()(GenericTensorAccessorR const &accessor) { - using T = real_type_t<DT>
; - - T const *data_ptr = accessor.get<DT>
(); - - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - if (data_ptr[i] != 0) { - return true; - } - } - - return false; - } -}; - bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 4637cb388e..fd2c1cd5e4 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -125,17 +125,17 @@ static std::optional auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_ptr = NULL; if (attrs.use_bias) { bias_ptr = bias.get_float_ptr(); } @@ -149,12 +149,12 @@ static std::optional "[Linear] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), - (float *)input_grad.get_float_ptr(), + input_grad.get_float_ptr(), output.get_float_ptr(), - (float *)output_grad.get_float_ptr(), + output_grad.get_float_ptr(), weight.get_float_ptr(), - (float *)weight_grad.get_float_ptr(), - (float *)bias_ptr, + weight_grad.get_float_ptr(), + bias_ptr, in_dim, out_dim, batch_size); diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..788ab52a7a 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,7 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()), From ddae36776dad2091f4b9aa79ef594c98258ed955 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Fri, 15 Nov 2024 17:09:37 -0800 Subject: [PATCH 18/42] code formatting and refactor --- lib/kernels/include/kernels/accessor.h | 103 +++++++++++++---- .../include/kernels/copy_tensor_accessor.h | 19 ++++ .../include/kernels/managed_ff_stream.h | 2 + .../kernels/managed_per_device_ff_handle.h | 2 + lib/kernels/src/accessor.cc | 104 +----------------- lib/kernels/src/copy_tensor_accessor.cc | 48 ++++++++ lib/kernels/src/cpu/replicate_kernels.cc | 4 +- lib/kernels/src/cpu/reverse_kernels.cc | 12 +- 
lib/kernels/src/cuda/ops/linear_kernels.cu | 42 +++---- lib/kernels/src/managed_ff_stream.cc | 9 +- .../src/managed_per_device_ff_handle.cc | 13 +-- .../test/src/test_managed_ff_stream.cc | 24 ++-- .../src/test_managed_per_device_ff_handle.cc | 26 +++-- .../test/src/test_local_cost_estimator.cc | 6 +- .../include/op-attrs/make_datatype_value.h | 16 +++ .../src/op-attrs/make_datatype_value.cc | 25 +++++ lib/pcg/src/pcg/computation_graph_builder.cc | 25 +++-- .../parallel_computation_graph_builder.cc | 9 +- 18 files changed, 285 insertions(+), 204 deletions(-) create mode 100644 lib/kernels/include/kernels/copy_tensor_accessor.h create mode 100644 lib/kernels/src/copy_tensor_accessor.cc create mode 100644 lib/op-attrs/include/op-attrs/make_datatype_value.h create mode 100644 lib/op-attrs/src/op-attrs/make_datatype_value.cc diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 653c8db42d..487bc1f8f0 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -11,8 +11,6 @@ namespace FlexFlow { -struct Allocator; - class GenericTensorAccessorR { public: template @@ -42,7 +40,7 @@ class GenericTensorAccessorR { bool operator!=(GenericTensorAccessorR const &) const; template - real_type_t
<DT> const &at(std::vector<size_t> const &indices) const { + real_type_t<DT>
const &at(std::vector<size_t> const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -50,11 +48,31 @@ class GenericTensorAccessorR { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t<DT>
; - T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } @@ -71,8 +89,6 @@ class GenericTensorAccessorR { decltype(ptr) const &, decltype(device_type) const &> tie() const; - - size_t calculate_index_offset(std::vector const &indices) const; }; std::string format_as(GenericTensorAccessorR const &); @@ -109,7 +125,7 @@ class GenericTensorAccessorW { operator GenericTensorAccessorR() const; template - real_type_t
<DT> &at(std::vector<size_t> const &indices) { + real_type_t<DT>
&at(std::vector<size_t> const &indices) { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -117,17 +133,37 @@ class GenericTensorAccessorW { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t<DT>
; T *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } template - real_type_t
<DT> &at(std::vector<size_t> const &indices) const { + real_type_t<DT>
&at(std::vector<size_t> const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -135,11 +171,31 @@ class GenericTensorAccessorW { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t<DT>
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } @@ -156,8 +212,6 @@ class GenericTensorAccessorW { decltype(ptr) const &, decltype(device_type) const &> tie() const; - - size_t calculate_index_offset(std::vector const &indices) const; }; std::string format_as(GenericTensorAccessorW const &); @@ -213,6 +267,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -239,14 +308,6 @@ std::pair void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor); -GenericTensorAccessorR - copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, - Allocator &allocator); - -GenericTensorAccessorW - copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, - Allocator &allocator); - } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..da8af71e4f --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..26d5fb4911 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -19,6 +19,8 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index f9f944c6ff..035ea574de 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -24,6 +24,8 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc 
index 4cb5bd83a2..e56bded737 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -26,7 +26,7 @@ void copy_accessor_data_to_l_from_r( dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); } else { assert(src_device_type == DeviceType::GPU); - assert(src_device_type == DeviceType::CPU); + assert(dst_device_type == DeviceType::GPU); checkCUDA(cudaMemcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes, @@ -53,36 +53,6 @@ std::tupledata_type, this->shape, this->ptr, this->device_type); } -size_t GenericTensorAccessorW::calculate_index_offset( - std::vector const &indices) const { - - if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error(fmt::format( - "Number of indices ({}) does not match the number of dimensions ({}).", - indices.size(), - this->shape.num_dims())); - } - - size_t offset = 0; - size_t multiplier = 1; - - for (size_t i = 0; i < this->shape.num_dims(); i++) { - if (indices[i] >= this->shape.at(legion_dim_t(i))) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices[i], - this->shape.at(legion_dim_t(i)))); - } - - offset += indices[i] * multiplier; - multiplier *= this->shape.at(legion_dim_t(i)); - } - - return offset; -} - bool GenericTensorAccessorW::operator==( GenericTensorAccessorW const &other) const { return this->tie() == other.tie(); @@ -139,36 +109,6 @@ std::tupledata_type, this->shape, this->ptr, this->device_type); } -size_t GenericTensorAccessorR::calculate_index_offset( - std::vector const &indices) const { - - if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error(fmt::format( - "Number of indices ({}) does not match the number of dimensions ({}).", - indices.size(), - this->shape.num_dims())); - } - - ssize_t offset = 0; - size_t multiplier = 1; - - for (size_t i = 0; i < this->shape.num_dims(); i++) { - if (indices[i] >= this->shape.at(legion_dim_t(i))) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices[i], - this->shape.at(legion_dim_t(i)))); - } - - offset += indices[i] * multiplier; - multiplier *= this->shape.at(legion_dim_t(i)); - } - - return offset; -} - bool GenericTensorAccessorR::operator==( GenericTensorAccessorR const &other) const { return this->tie() == other.tie(); @@ -280,46 +220,4 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -template -struct CopyTensorAccessorW { - GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, - Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - - copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); - - return dst_accessor; - } -}; - -GenericTensorAccessorW - copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, - Allocator &allocator) { - return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); -} - -template -struct CopyTensorAccessorR { - GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, - Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - - copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); - - return read_only_accessor_from_write_accessor(dst_accessor); - } -}; - 
-GenericTensorAccessorR - copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, - Allocator &allocator) { - return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); -} - } // namespace FlexFlow diff --git a/lib/kernels/src/copy_tensor_accessor.cc b/lib/kernels/src/copy_tensor_accessor.cc new file mode 100644 index 0000000000..6a3ad8033a --- /dev/null +++ b/lib/kernels/src/copy_tensor_accessor.cc @@ -0,0 +1,48 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 25693b374d..cfcb44dac5 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -19,9 +19,9 @@ struct CPUBackwardKernel { GenericTensorAccessorW &input, size_t num_replicas) { using T = real_type_t
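Note: copy_tensor_accessor_r/_w convert the accessor's runtime data_type into the compile-time parameter DT through DataTypeDispatch1. A reduced sketch of that dispatch idiom, with a three-value enum standing in for the real DataType and the machinery in kernels/datatype_dispatch.h (all names here are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>
    #include <utility>

    enum class DataType { FLOAT, DOUBLE, INT32 };

    template <DataType DT> struct real_type;
    template <> struct real_type<DataType::FLOAT> { using type = float; };
    template <> struct real_type<DataType::DOUBLE> { using type = double; };
    template <> struct real_type<DataType::INT32> { using type = int32_t; };

    // F is a class template over DataType, like CopyTensorAccessorW above.
    // Every specialization must agree on the call operator's return type,
    // which is why CopyTensorAccessorW always returns GenericTensorAccessorW.
    template <template <DataType> class F, typename... Args>
    auto dispatch(DataType dt, Args &&...args) {
      switch (dt) {
        case DataType::FLOAT:
          return F<DataType::FLOAT>{}(std::forward<Args>(args)...);
        case DataType::DOUBLE:
          return F<DataType::DOUBLE>{}(std::forward<Args>(args)...);
        case DataType::INT32:
          return F<DataType::INT32>{}(std::forward<Args>(args)...);
      }
      throw std::runtime_error("unhandled DataType");
    }

    template <DataType DT> struct SizeOfType {
      size_t operator()() const { return sizeof(typename real_type<DT>::type); }
    };

    // dispatch<SizeOfType>(DataType::DOUBLE) == 8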
; - for (size_t i = 0; i < input.shape.num_elements(); i++) { + for (int i = 0; i < input.shape.num_elements(); i++) { T cur_sum = 0; - for (size_t j = 0; j < num_replicas; j++) { + for (int j = 0; j < num_replicas; j++) { cur_sum += output.at
({i, j}); } input.at
({i}) = cur_sum; diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index e5b3719d74..bc73c80e9e 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,13 +11,13 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - size_t num_out_blocks = input.shape.at(legion_dim_t(0)); - size_t reverse_dim_size = input.shape.at(legion_dim_t(1)); - size_t in_block_size = input.shape.at(legion_dim_t(2)); + int num_out_blocks = input.shape.at(legion_dim_t(0)); + int reverse_dim_size = input.shape.at(legion_dim_t(1)); + int in_block_size = input.shape.at(legion_dim_t(2)); - for (size_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { - for (size_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { - for (size_t i = 0; i < in_block_size; i++) { + for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) { + for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { + for (int i = 0; i < in_block_size; i++) { output.at
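Note: since replicate's forward pass just copies the input, CPUBackwardKernel in cpu/replicate_kernels.cc sums the gradient over the replica axis. The same reduction written against a flat buffer, as a sketch (assumed layout: element index fastest, replica index slowest, matching output.at({i, j}) with legion dim 0 as the element axis):

    #include <cstddef>
    #include <vector>

    // input_grad[i] = sum over replicas j of output_grad[j * num_elements + i]
    void replicate_backward(std::vector<float> &input_grad,
                            std::vector<float> const &output_grad,
                            size_t num_replicas) {
      size_t num_elements = input_grad.size();
      for (size_t i = 0; i < num_elements; i++) {
        float cur_sum = 0.0f;
        for (size_t j = 0; j < num_replicas; j++) {
          cur_sum += output_grad[j * num_elements + i];
        }
        input_grad[i] = cur_sum;
      }
    }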
({block_idx, rev_idx, i}) = input.at
({num_out_blocks - 1 - block_idx, reverse_dim_size - 1 - rev_idx, diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index f13ebee67e..6b069218fa 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - reinterpret_cast(weight_ptr), + static_cast(weight_ptr), weight_type, in_dim, - reinterpret_cast(input_ptr), + static_cast(input_ptr), input_type, in_dim, &beta, - reinterpret_cast(output_ptr), + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - reinterpret_cast(bias_ptr), + static_cast(bias_ptr), weight_type, 1, - reinterpret_cast(m.one_ptr), + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - reinterpret_cast(output_ptr), + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - reinterpret_cast(output_ptr), + static_cast(output_ptr), &beta, m.outputTensor, - reinterpret_cast(output_ptr))); + static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -217,14 +217,14 @@ void backward_kernel(cudaStream_t stream, if (m.activation.has_value()) { if (m.activation == Activation::RELU) { relu_backward_kernel(m.output_type, - reinterpret_cast(output_grad_ptr), - reinterpret_cast(output_ptr), + static_cast(output_grad_ptr), + static_cast(output_ptr), output_size, stream); } else if (m.activation == Activation::SIGMOID) { sigmoid_backward_kernel(m.output_type, - reinterpret_cast(output_grad_ptr), - reinterpret_cast(output_ptr), + static_cast(output_grad_ptr), + static_cast(output_ptr), output_size, stream); } else { @@ -241,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - reinterpret_cast(input_ptr), + static_cast(input_ptr), input_type, in_dim, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(kernel_grad_ptr), + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -290,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - reinterpret_cast(m.one_ptr), + static_cast(m.one_ptr), CUDA_R_32F, 1, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(bias_grad_ptr), + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -313,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - reinterpret_cast(kernel_ptr), + static_cast(kernel_ptr), weight_type, in_dim, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(input_grad_ptr), + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index a8b44dc1d3..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -12,16 +12,17 @@ ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { if (this != &other) { - if (this->stream != nullptr) { - 
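Note: CPUReverseForwardKernel in cpu/reverse_kernels.cc mirrors both the block axis and the reverse axis while keeping the intra-block offset fixed. The same index mapping over flat buffers, as a sketch (strides assumed to follow the legion convention, dimension 0 fastest; hypothetical helper, not part of this patch):

    #include <cstddef>
    #include <vector>

    // out[b][r][i] = in[B - 1 - b][R - 1 - r][i]
    void reverse_forward(std::vector<float> const &in,
                         std::vector<float> &out,
                         size_t B,  // num_out_blocks
                         size_t R,  // reverse_dim_size
                         size_t I)  // in_block_size
    {
      auto idx = [&](size_t b, size_t r, size_t i) {
        return b + r * B + i * B * R;  // dim 0 (b) has stride 1
      };
      for (size_t b = 0; b < B; b++) {
        for (size_t r = 0; r < R; r++) {
          for (size_t i = 0; i < I; i++) {
            out[idx(b, r, i)] = in[idx(B - 1 - b, R - 1 - r, i)];
          }
        }
      }
    }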
checkCUDA(cudaStreamDestroy(*this->stream)); - delete stream; - } + this->cleanup(); this->stream = std::exchange(other.stream, nullptr); } return *this; } ManagedFFStream::~ManagedFFStream() { + this->cleanup(); +} + +void ManagedFFStream::cleanup() { if (this->stream != nullptr) { checkCUDA(cudaStreamDestroy(*this->stream)); delete this->stream; diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index 5bd49dc26f..9f1737240e 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -5,7 +5,7 @@ namespace FlexFlow { ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( size_t workSpaceSize, bool allowTensorOpMathConversion) { - this->handle = new PerDeviceFFHandle; + this->handle = new PerDeviceFFHandle{}; this->handle->workSpaceSize = workSpaceSize; this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; @@ -21,18 +21,17 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { if (this != &other) { - if (this->handle != nullptr) { - checkCUDNN(cudnnDestroy(this->handle->dnn)); - checkCUBLAS(cublasDestroy(this->handle->blas)); - checkCUDA(cudaFree(this->handle->workSpace)); - delete this->handle; - } + this->cleanup(); this->handle = std::exchange(other.handle, nullptr); } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { + this->cleanup(); +} + +void ManagedPerDeviceFFHandle::cleanup() { if (this->handle != nullptr) { checkCUDNN(cudnnDestroy(this->handle->dnn)); checkCUBLAS(cublasDestroy(this->handle->blas)); diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index ce8a808454..605aa6ffa1 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -4,26 +4,28 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Managed FF Stream") { + TEST_CASE("ManagedFFStream") { ManagedFFStream base_stream{}; ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - SUBCASE("Test ManagedFFStream Move Constructor") { + SUBCASE("move constructor") { ManagedFFStream new_stream(std::move(base_stream)); CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } - SUBCASE("Test ManagedFFStream Assignment Operator") { - ManagedFFStream new_stream{}; - new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); - CHECK(&new_stream.raw_stream() == base_stream_ptr); - } + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedFFStream new_stream{}; + new_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } - SUBCASE("Test Self-Assignment") { - base_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == base_stream_ptr); + SUBCASE("move assign to self") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d39da03ba9..de3e5b72b1 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -4,33 +4,35 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - 
TEST_CASE("Test Managed Per Device FF Handle") { + TEST_CASE("ManagedPerDeviceFFHandle") { ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { + SUBCASE("constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); } - SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { + SUBCASE("move constructor") { ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } - SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { - ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; - new_handle = std::move(base_handle); + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; + new_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == nullptr); - CHECK(&new_handle.raw_handle() == base_handle_ptr); - } + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } - SUBCASE("Test Self-Assignment") { - base_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == base_handle_ptr); + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } } diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 788ab52a7a..512c1ef33b 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,11 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); +// ManagedPerDeviceFFHandle managed_handle{ +/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true +} +; // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h new file mode 100644 index 0000000000..c3289c6309 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/make_datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/make_datatype_value.cc new file mode 100644 index 0000000000..bc402c433c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/make_datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/make_datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue make_double_data_type_value(double value) { + return 
DataTypeValue{value}; +} + +DataTypeValue make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 2d523c78ac..7ff5bec2f7 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -3,6 +3,7 @@ #include "op-attrs/get_incoming_tensor_roles.h" #include "op-attrs/get_op_type.h" #include "op-attrs/get_output_shapes.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_norm.h" #include "op-attrs/ops/broadcast.h" @@ -613,14 +614,14 @@ tensor_guid_t ComputationGraphBuilder::batch_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } @@ -692,8 +693,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_input_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs input_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs input_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(input_bias_shape, input_bias_initializer)); @@ -702,8 +703,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_output_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs output_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs output_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(output_bias_shape, output_bias_initializer)); @@ -888,14 +889,14 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - 
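Note: the make_*_data_type_value factories pin down which alternative of the variant gets constructed. Brace-initializing from a bare literal resolves the alternative by overload resolution, so DataTypeValue{1} would hold an int32_t rather than a float; the factory's parameter type removes that ambiguity at every call site, as in the gamma/beta initializers above. A small sketch of the hazard, assuming DataTypeValue behaves like a std::variant over these five types:

    #include <cstdint>
    #include <variant>

    using Value = std::variant<float, double, int32_t, int64_t, bool>;

    Value make_float_value(float value) {
      return Value{value};  // alternative fixed by the parameter type
    }

    // Value{1} would hold int32_t; the factory yields the float alternative:
    Value one = make_float_value(1);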
InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index f33b4dcd17..79ac43ae66 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -1,5 +1,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "op-attrs/get_incoming_tensor_roles.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/ops/batch_norm.h" @@ -385,14 +386,14 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_norm( ParallelTensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); ParallelTensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } From 507df4a30d5b20eef90fc2f8ed75e2707a0e2b6c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 21 Nov 2024 22:16:51 -0800 Subject: [PATCH 19/42] issue #1502 & issue #1540 --- lib/kernels/CMakeLists.txt | 3 +- .../include/kernels/batch_norm_kernels.h | 4 +- lib/kernels/include/kernels/cast_kernels.h | 4 +- .../include/kernels/cast_kernels_cpu.h | 4 +- lib/kernels/include/kernels/conv_2d_kernels.h | 4 +- .../include/kernels/element_unary_kernels.h | 6 +- .../include/kernels/embedding_kernels.h | 4 +- lib/kernels/include/kernels/flat_kernels.h | 7 +- lib/kernels/include/kernels/linear_kernels.h | 4 +- .../include/kernels/loss_function_kernels.h | 2 +- lib/kernels/include/kernels/metrics_kernels.h | 29 +- .../include/kernels/optimizer_kernels.h | 124 ++-- .../include/kernels/partition_kernels.h | 4 +- .../kernels}/per_device_op_state.variant.toml | 0 lib/kernels/include/kernels/pool_2d_kernels.h | 9 +- .../include/kernels/reduction_kernels.h | 4 +- lib/kernels/include/kernels/reshape_kernels.h | 4 +- lib/kernels/include/kernels/softmax_kernels.h | 2 +- .../include/kernels/transpose_kernels.h | 4 +- lib/kernels/src/cpu/cast_kernels.cc | 14 +- lib/kernels/src/cuda/cuda_helper.cu | 12 +- lib/kernels/src/cuda/embedding_kernels.cu | 549 ++++++++++++++---- lib/kernels/src/cuda/metrics_functions.cu | 101 ++-- .../src/cuda/ops/batch_norm_kernels.cu | 4 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 14 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 4 +- .../src/cuda/ops/element_unary_kernels.cu | 18 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 4 +- lib/kernels/src/cuda/ops/linear_kernels.cu | 4 +- lib/kernels/src/cuda/ops/partition_kernels.cu | 10 +- lib/kernels/src/cuda/ops/pool_2d_kernels.cu | 6 +- 
lib/kernels/src/cuda/ops/reduction_kernels.cu | 10 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 10 +- lib/kernels/src/cuda/ops/softmax_kernels.cu | 2 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 4 +- ...timizer_kernel.cu => optimizer_kernels.cu} | 57 +- .../local-execution/per_device_op_state.h | 2 +- .../local-execution/task_argument_accessor.h | 2 +- lib/local-execution/src/ops/batch_norm.cc | 4 +- lib/local-execution/src/ops/conv_2d.cc | 6 +- lib/local-execution/src/ops/element_unary.cc | 10 +- lib/local-execution/src/ops/flat.cc | 6 +- lib/local-execution/src/ops/linear.cc | 4 +- lib/local-execution/src/ops/pool_2d.cc | 10 +- lib/local-execution/src/ops/reduction.cc | 6 +- lib/local-execution/src/ops/repartition.cc | 4 +- lib/local-execution/src/ops/reshape.cc | 4 +- lib/local-execution/src/ops/softmax.cc | 2 +- lib/local-execution/src/ops/transpose.cc | 4 +- ...device_state.cc => per_device_op_state.cc} | 0 .../include/op-attrs/aggregate_op.enum.toml | 5 +- .../include/op-attrs/datatype_value.h | 16 + .../include/op-attrs/make_datatype_value.h | 2 +- .../src/op-attrs/make_datatype_value.cc | 10 +- lib/pcg/include/pcg/metric.h | 73 +++ lib/pcg/src/pcg/metric.cc | 38 ++ lib/runtime/src/metrics_functions.cc | 33 -- lib/runtime/src/metrics_functions.h | 63 +- lib/runtime/src/ops/embedding.cc | 4 +- 59 files changed, 917 insertions(+), 436 deletions(-) rename lib/{local-execution/include/local-execution => kernels/include/kernels}/per_device_op_state.variant.toml (100%) rename lib/kernels/src/cuda/{optimizer_kernel.cu => optimizer_kernels.cu} (80%) rename lib/local-execution/src/{per_device_state.cc => per_device_op_state.cc} (100%) create mode 100644 lib/op-attrs/include/op-attrs/datatype_value.h create mode 100644 lib/pcg/include/pcg/metric.h create mode 100644 lib/pcg/src/pcg/metric.cc diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index fc91b7d3db..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 4de6ac6af0..3fea92c86b 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -63,9 +63,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 21e76fed1d..da13e0036d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -11,8 +11,8 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 275476b4e6..a5df80d4da 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ 
b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -9,8 +9,8 @@ namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index 217751e191..f49c8f50f4 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -60,10 +60,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 26ce4ecaec..c338f465ac 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -36,10 +36,10 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); } // namespace Kernels::ElementUnary } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 6d5141f489..f5b2561b56 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -17,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 41b411c937..d60a1a5157 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -9,10 +9,11 @@ namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); -void backward_kernel(ffStream_t stream, + +void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); } // namespace FlexFlow::Kernels::Flat diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index cff6563629..cd581b0a25 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -60,10 +60,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - float const 
*input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *kernel_ptr, float *kernel_grad_ptr, float *bias_ptr, diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..9e0dbd4ba1 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..d961ee7503 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..3b5d292a5f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -2,53 +2,91 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #include "device.h" +#include "kernels/ff_handle.h" +#include "kernels/nccl.h" +#include "kernels/per_device_op_state.dtg.h" namespace FlexFlow { -void sgd_ps_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W); + +class SGDOptimizer { +public: + static __host__ void ps_update_task_gpu(SGDOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr); + +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(SGDOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr); +#endif + +public: + float lr; + float weight_decay; + float momentum; + bool nesterov; +}; + +__global__ void + add_kernel(int count, float scale, float const *src, float *dst); + +__global__ void scale_kernel(int count, float a, float b, float *ptr); + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, float weight_decay, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float 
*sgd_v_ptr); - -void sgd_nccl_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *sgd_v_ptr); - -void adam_ps_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); - -void adam_nccl_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W); -} // namespace FlexFlow +class AdamOptimizer { +public: + static __host__ void ps_update_task_gpu(AdamOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr); +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(AdamOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr); #endif + +public: + float alpha; + float alpha_t; + float beta1; + float beta2; + float weight_decay; + float epsilon; +}; + +} // namespace FlexFlow + +#endif // _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index e580c4a9de..9a303952d0 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -25,8 +25,8 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); } // namespace Kernels::Repartition } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 191c23bc98..c0e57e2c9a 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,12 +67,13 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + } // namespace Kernels::Pool2D } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index 7e1e240ea4..12553edd5e 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -12,8 +12,8 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - 
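Note: the sgd_update kernel declared above folds weight decay, momentum, and optional Nesterov into a single elementwise pass. A sketch of a conventional body for that signature (assumed semantics, following the standard SGD-with-momentum formulation; V is the momentum buffer, W the weights):

    __global__ void sgd_update_sketch(size_t count,
                                      float lr,
                                      float weight_decay,
                                      float momentum,
                                      bool nesterov,
                                      float const *WGrad,
                                      float *V,
                                      float *W) {
      CUDA_KERNEL_LOOP(i, count) {
        float gt = WGrad[i] + weight_decay * W[i];  // L2 decay folded into grad
        float mt = momentum * V[i] + gt;            // momentum accumulation
        V[i] = mt;
        // Nesterov looks ahead by applying the fresh gradient on top of the
        // momentum step; plain momentum just follows the buffer.
        W[i] -= lr * (nesterov ? gt + momentum * mt : mt);
      }
    }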
GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Reduction diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 5fa4382c43..6e19a9d251 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -24,8 +24,8 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace Kernels::Reshape } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 93135cb648..520ea61b64 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -30,8 +30,8 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); } // namespace Kernels::Softmax diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index b48b7e0aa8..dbf78826cb 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -28,8 +28,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); } // namespace Kernels::Transpose } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 5a00503fe4..08f5552afc 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -28,11 +28,11 @@ struct CPUForwardKernel { template struct CPUBackwardKernel { - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); cpu_cast_backward( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; @@ -42,10 +42,10 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input, input.data_type, output.data_type, input, output); } -void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input.data_type, output.data_type, input, output); + output.data_type, input.data_type, output, input); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 2ff02038f4..b30cf6a663 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void 
scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template __global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..c83e9f0a94 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -17,12 +17,11 @@ #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the input tensor to avoid out-of-range index issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the input tensor to avoid out-of-range index issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const 
*input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,219 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - 
assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + 
void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -388,39 +547,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> + embed_backward_no_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> + embed_backward_with_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + 
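Note: each ForwardKernel/BackwardKernel specialization in this hunk instantiates the same kernel shape for one (index type, data type) pair; only the template arguments change. The aggregated-forward logic itself, written once as a generic sketch (assumed layout: each of the batch_size rows of input holds in_dim token indices into the embedding table; AVG scales the accumulated sum by 1/in_dim):

    template <typename TI, typename TD>
    __global__ void embed_forward_with_aggr_sketch(TI const *input,
                                                   TD *output,
                                                   TD const *embed,
                                                   int out_dim,
                                                   int in_dim,
                                                   int batch_size,
                                                   AggregateOp aggr) {
      TD scale = static_cast<TD>(1.0f / in_dim);
      CUDA_KERNEL_LOOP(i, batch_size * out_dim) {
        int idx = i / out_dim;  // which batch row
        int off = i % out_dim;  // which output feature
        TD acc = static_cast<TD>(0.0f);
        for (int j = 0; j < in_dim; j++) {
          TI wordIdx = input[idx * in_dim + j];
          acc += embed[wordIdx * out_dim + off];  // gather one embedding row
        }
        output[i] = (aggr == AggregateOp::AVG) ? acc * scale : acc;
      }
    }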
embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -448,27 +797,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..2901f1d374 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. 
*/ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,36 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, *me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics 
perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + update_metrics_label_kernel<<>>( + logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 6c6e17a181..512981e32b 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index dc342fd0e0..afc3e1f7ef 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -50,11 +50,11 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; @@ -66,10 +66,10 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input.data_type, output.data_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index e3a4c97a31..0a4024ba8a 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index a35d28fa8c..687a9fa220 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW 
const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 941db108a0..f661e5fb0a 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -34,8 +34,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 6b069218fa..0d5a772918 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -191,10 +191,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *kernel_ptr, float *kernel_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 1d07efb5fa..3687c1cedf 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -39,8 +39,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel><<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..f8b35ec885 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 
0c6ba7d8e3..9c3e8dcc40 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -54,8 +54,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements() * size_of_datatype(T), @@ -73,9 +73,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 5b7843a3a5..b7a328ca08 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -42,8 +42,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/softmax_kernels.cu b/lib/kernels/src/cuda/ops/softmax_kernels.cu index 93ed85de18..d2498d08a4 100644 --- a/lib/kernels/src/cuda/ops/softmax_kernels.cu +++ b/lib/kernels/src/cuda/ops/softmax_kernels.cu @@ -61,8 +61,8 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 3b3f80944d..37e1a08326 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -91,8 +91,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad) { + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { TransposeStrides info; info.num_dim = in_grad.shape.num_dims(); diff --git a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 80% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..237a277b21 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,7 +13,9 @@ * limitations under the License. 
*/ +#include "device.h" #include "kernels/optimizer_kernels.h" +#include "utils/exception.h" namespace FlexFlow { @@ -80,13 +82,28 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + const auto& state = meta->raw_variant; + ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, state); + checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + comm, + stream)); + // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -157,7 +174,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, for (int i = 1; i < num_replicas; i++) { float const *src = w_grad_ptr + i * size; add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); + (float *)w_grad_ptr, src, size); } // checkCUDA(cudaDeviceSynchronize()); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", @@ -188,13 +205,27 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + const auto& state = meta->raw_variant; + ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, state); + checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + comm, + stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" 
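Note on the optimizer kernel changes above: both the SGD and Adam NCCL update paths now obtain the communicator by visiting a variant of per-device states and rejecting state types that carry no NCCL handle. A minimal self-contained sketch of that pattern, assuming C++17; StateWithComm, StateWithoutComm, and the ncclComm_t alias below are invented stand-ins, not the real FlexFlow or NCCL types:

#include <stdexcept>
#include <type_traits>
#include <variant>

using ncclComm_t = void *; // stand-in for the opaque NCCL handle type

struct StateWithComm {
  struct { ncclComm_t ncclComm; } handle;
};
struct StateWithoutComm {};

using PerDeviceState = std::variant<StateWithComm, StateWithoutComm>;

ncclComm_t get_nccl_comm(PerDeviceState const &state) {
  return std::visit(
      [](auto const &s) -> ncclComm_t {
        using T = std::decay_t<decltype(s)>;
        if constexpr (std::is_same_v<T, StateWithoutComm>) {
          // Mirrors the mk_runtime_error thrown in the patch.
          throw std::runtime_error(
              "State type does not support NCCL operations");
        } else {
          return s.handle.ncclComm;
        }
      },
      state);
}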
-#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..3aed3111c7 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -133,9 +133,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..d7c5c22170 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -108,8 +108,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -121,10 +121,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 4ee609bd6c..10f1dce294 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -89,10 +89,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -107,10 +107,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 3fe5029fa1..8d998a8672 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -41,15 +41,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index fd2c1cd5e4..1eb0360db4 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -148,10 +148,10 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", 
per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), weight.get_float_ptr(), weight_grad.get_float_ptr(), bias_ptr, diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 3ab33a2ad6..a1167a731c 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -125,19 +125,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index a58d79a4f8..1e85d7186e 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -64,13 +64,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 73692f4a13..655e1f238b 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -86,8 +86,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 7584d405eb..761718a9a7 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -87,8 +87,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 8d412c739b..71a6ce435e 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -107,8 +107,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 53cf1f20ed..30310d3349 100644 --- 
a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -98,8 +98,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..2c524c120a 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,8 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + +[[values]] +name = "NONE" diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h index c3289c6309..af4792dd9e 100644 --- a/lib/op-attrs/include/op-attrs/make_datatype_value.h +++ b/lib/op-attrs/include/op-attrs/make_datatype_value.h @@ -11,6 +11,6 @@ DataTypeValue make_int32_data_type_value(int32_t value); DataTypeValue make_int64_data_type_value(int64_t value); DataTypeValue make_bool_data_type_value(bool value); -} +} // namespace FlexFlow #endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/make_datatype_value.cc index bc402c433c..76d712949a 100644 --- a/lib/op-attrs/src/op-attrs/make_datatype_value.cc +++ b/lib/op-attrs/src/op-attrs/make_datatype_value.cc @@ -11,15 +11,15 @@ DataTypeValue make_double_data_type_value(double value) { } DataTypeValue make_int32_data_type_value(int32_t value) { - return DataTypeValue{value}; + return DataTypeValue{value}; } DataTypeValue make_int64_data_type_value(int64_t value) { - return DataTypeValue{value}; + return DataTypeValue{value}; } DataTypeValue make_bool_data_type_value(bool value) { - return DataTypeValue{value}; -} - + return DataTypeValue{value}; } + +} // namespace FlexFlow diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h new file mode 100644 index 0000000000..f56078772e --- /dev/null +++ b/lib/pcg/include/pcg/metric.h @@ -0,0 +1,73 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include +#include "utils/fmt.h" +#include "op-attrs/ops/loss_functions/loss_functions.h" + +namespace FlexFlow { + +enum class Metric { + ACCURACY, + CATEGORICAL_CROSSENTROPY, + SPARSE_CATEGORICAL_CROSSENTROPY, + MEAN_SQUARED_ERROR, + 
ROOT_MEAN_SQUARED_ERROR,
+  MEAN_ABSOLUTE_ERROR,
+};
+
+class MetricsAttrs {
+public:
+  MetricsAttrs() = delete;
+  MetricsAttrs(LossFunction, std::vector<Metric> const &);
+
+public:
+  LossFunction loss_type;
+  bool measure_accuracy;
+  bool measure_categorical_crossentropy;
+  bool measure_sparse_categorical_crossentropy;
+  bool measure_mean_squared_error;
+  bool measure_root_mean_squared_error;
+  bool measure_mean_absolute_error;
+};
+
+} // namespace FlexFlow
+
+namespace fmt {
+
+template <>
+struct formatter<::FlexFlow::Metric> : formatter<string_view> {
+  template <typename FormatContext>
+  auto format(::FlexFlow::Metric m, FormatContext &ctx) const
+      -> decltype(ctx.out()) {
+    using namespace FlexFlow;
+
+    string_view name = "unknown";
+    switch (m) {
+      case Metric::ACCURACY:
+        name = "Accuracy";
+        break;
+      case Metric::CATEGORICAL_CROSSENTROPY:
+        name = "CategoricalCrossEntropy";
+        break;
+      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+        name = "SparseCategoricalCrossEntropy";
+        break;
+      case Metric::MEAN_SQUARED_ERROR:
+        name = "MeanSquaredError";
+        break;
+      case Metric::ROOT_MEAN_SQUARED_ERROR:
+        name = "RootMeanSquaredError";
+        break;
+      case Metric::MEAN_ABSOLUTE_ERROR:
+        name = "MeanAbsoluteError";
+        break;
+    }
+    return formatter<string_view>::format(name, ctx);
+  }
+};
+
+} // namespace fmt
+
+
+#endif
diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc
new file mode 100644
index 0000000000..eb0d6bc5d0
--- /dev/null
+++ b/lib/pcg/src/pcg/metric.cc
@@ -0,0 +1,38 @@
+#include "pcg/metric.h"
+
+namespace FlexFlow {
+MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
+                           std::vector<Metric> const &metrics)
+    : loss_type(_loss_type), measure_accuracy(false),
+      measure_categorical_crossentropy(false),
+      measure_sparse_categorical_crossentropy(false),
+      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
+      measure_mean_absolute_error(false) {
+for (Metric const &m : metrics) {
+  switch (m) {
+    case Metric::ACCURACY:
+      measure_accuracy = true;
+      continue;
+    case Metric::CATEGORICAL_CROSSENTROPY:
+      measure_categorical_crossentropy = true;
+      continue;
+    case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+      measure_sparse_categorical_crossentropy = true;
+      continue;
+    case Metric::MEAN_SQUARED_ERROR:
+      measure_mean_squared_error = true;
+      continue;
+    case Metric::ROOT_MEAN_SQUARED_ERROR:
+      measure_root_mean_squared_error = true;
+      continue;
+    case Metric::MEAN_ABSOLUTE_ERROR:
+      measure_mean_absolute_error = true;
+      continue;
+    default:
+      throw mk_runtime_error("Initializing MetricsAttrs with unrecognized metrics type");
+  }
+}
+}
+
+
+}
diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc
index feb6e704b2..33e15baed2 100644
--- a/lib/runtime/src/metrics_functions.cc
+++ b/lib/runtime/src/metrics_functions.cc
@@ -25,39 +25,6 @@ namespace FlexFlow {
 
 LegionRuntime::Logger::Category log_metrics("metrics");
 
-MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
-    : loss_type(_loss_type), measure_accuracy(false),
-      measure_categorical_crossentropy(false),
-      measure_sparse_categorical_crossentropy(false),
-      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
-      measure_mean_absolute_error(false) {
-  for (Metric const &m : metrics) {
-    switch (m) {
-      case Metric::ACCURACY:
-        measure_accuracy = true;
-        continue;
-      case Metric::CATEGORICAL_CROSSENTROPY:
-        measure_categorical_crossentropy = true;
-        continue;
-      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-        measure_sparse_categorical_crossentropy = true;
-        continue;
-      case Metric::MEAN_SQUARED_ERROR:
-
measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include "task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 253fd3cb4f..83e7c15460 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), From c64a55c3cfbad062d3fa6fd6b705c4cdb7509fac Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 21 Nov 2024 22:46:25 -0800 Subject: [PATCH 20/42] format check --- lib/kernels/include/kernels/pool_2d_kernels.h | 1 - lib/kernels/src/cuda/metrics_functions.cu | 5 +- lib/kernels/src/cuda/optimizer_kernels.cu | 76 +++++++++---------- lib/pcg/include/pcg/metric.h | 5 +- lib/pcg/src/pcg/metric.cc | 62 +++++++-------- 5 files changed, 69 insertions(+), 80 deletions(-) diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index c0e57e2c9a..ad0a52efb9 100644 --- 
a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -74,7 +74,6 @@ void backward_kernel(cudaStream_t stream, void const *input_ptr, void *input_grad_ptr); - } // namespace Kernels::Pool2D } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2901f1d374..0250f829ec 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -200,10 +200,7 @@ void update_metrics_label_kernel_wrapper(float const *logit_ptr, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - update_metrics_label_kernel<<>>( + update_metrics_label_kernel<<>>( logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); checkCUDA(cudaMemcpy( diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 237a277b21..1c6954a0b0 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -83,26 +83,23 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - const auto& state = meta->raw_variant; - ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { - using T = std::decay_t; - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) { - throw mk_runtime_error("State type does not support NCCL operations"); - } else { - return s.handle.ncclComm; - } - }, state); - - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - comm, - stream)); + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -205,27 +202,24 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - - const auto& state = meta->raw_variant; - ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { - using T = std::decay_t; - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) { - throw mk_runtime_error("State type does not support NCCL operations"); - } else { - return s.handle.ncclComm; - } - }, state); - - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - comm, - stream)); + + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam 
update
diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h
index f56078772e..718919112f 100644
--- a/lib/pcg/include/pcg/metric.h
+++ b/lib/pcg/include/pcg/metric.h
@@ -1,9 +1,9 @@
 #ifndef _FF_METRICS_H_
 #define _FF_METRICS_H_
 
-#include <vector>
-#include "utils/fmt.h"
+#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "utils/fmt.h"
+#include <vector>
 
 namespace FlexFlow {
 
@@ -69,5 +69,4 @@ struct formatter<::FlexFlow::Metric> : formatter<string_view> {
 
 } // namespace fmt
 
-
 #endif
diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc
index eb0d6bc5d0..69aba90d12 100644
--- a/lib/pcg/src/pcg/metric.cc
+++ b/lib/pcg/src/pcg/metric.cc
@@ -2,37 +2,37 @@
 namespace FlexFlow {
 MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
-    : loss_type(_loss_type), measure_accuracy(false),
-      measure_categorical_crossentropy(false),
-      measure_sparse_categorical_crossentropy(false),
-      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
-      measure_mean_absolute_error(false) {
-for (Metric const &m : metrics) {
-  switch (m) {
-    case Metric::ACCURACY:
-      measure_accuracy = true;
-      continue;
-    case Metric::CATEGORICAL_CROSSENTROPY:
-      measure_categorical_crossentropy = true;
-      continue;
-    case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-      measure_sparse_categorical_crossentropy = true;
-      continue;
-    case Metric::MEAN_SQUARED_ERROR:
-      measure_mean_squared_error = true;
-      continue;
-    case Metric::ROOT_MEAN_SQUARED_ERROR:
-      measure_root_mean_squared_error = true;
-      continue;
-    case Metric::MEAN_ABSOLUTE_ERROR:
-      measure_mean_absolute_error = true;
-      continue;
-    default:
-      throw mk_runtime_error("Initializing MetricsAttrs with unrecognized metrics type");
+                           std::vector<Metric> const &metrics)
+    : loss_type(_loss_type), measure_accuracy(false),
+      measure_categorical_crossentropy(false),
+      measure_sparse_categorical_crossentropy(false),
+      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
+      measure_mean_absolute_error(false) {
+  for (Metric const &m : metrics) {
+    switch (m) {
+      case Metric::ACCURACY:
+        measure_accuracy = true;
+        continue;
+      case Metric::CATEGORICAL_CROSSENTROPY:
+        measure_categorical_crossentropy = true;
+        continue;
+      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+        measure_sparse_categorical_crossentropy = true;
+        continue;
+      case Metric::MEAN_SQUARED_ERROR:
+        measure_mean_squared_error = true;
+        continue;
+      case Metric::ROOT_MEAN_SQUARED_ERROR:
+        measure_root_mean_squared_error = true;
+        continue;
+      case Metric::MEAN_ABSOLUTE_ERROR:
+        measure_mean_absolute_error = true;
+        continue;
+      default:
+        throw mk_runtime_error(
+            "Initializing MetricsAttrs with unrecognized metrics type");
+      }
+    }
+  }
 }
-}
-
-}
+} // namespace FlexFlow

From a091652370cb2a2c29d60100253fd6fba2882307 Mon Sep 17 00:00:00 2001
From: Dylan Lim
Date: Mon, 27 Jan 2025 20:57:10 -0800
Subject: [PATCH 21/42] branch merge and test fixes

---
 lib/kernels/include/kernels/accessor.h        | 76 ++++--------
 lib/kernels/include/kernels/flat_kernels.h    |  2 +-
 .../include/kernels/loss_function_kernels.h   |  2 +-
 .../include/kernels/managed_ff_stream.h       |  1 +
 .../kernels/managed_per_device_ff_handle.h    |  1 +
 lib/kernels/include/kernels/metrics_kernels.h |  6 +-
 lib/kernels/include/kernels/pool_2d_kernels.h |  2 +-
 lib/kernels/src/cuda/metrics_functions.cu     | 10 +-
 lib/kernels/src/hip/embedding_kernels.cpp     | 30 ++---
 .../test/src/test_batch_norm_kernel.cc        |  2 +-
 lib/kernels/test/src/test_concat_kernel.cc    |  2 +-
 lib/kernels/test/src/test_flat_kernel.cc      |  2 +-
.../test/src/test_layer_norm_kernels.cc | 2 +- .../src/test_managed_per_device_ff_handle.cc | 7 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 2 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- lib/kernels/test/src/test_utils.cc | 18 ++- lib/kernels/test/src/test_utils.h | 2 +- .../test/src/test_local_cost_estimator.cc | 115 +++++++++--------- .../include/op-attrs/aggregate_op.enum.toml | 2 - .../include/op-attrs/make_datatype_value.h | 16 --- ...ke_datatype_value.cc => datatype_value.cc} | 2 +- .../test/src/op-attrs/datatype_value.cc | 68 +++++++++++ lib/pcg/include/pcg/metric.enum.toml | 26 ++++ lib/pcg/include/pcg/metric.h | 72 ----------- lib/pcg/include/pcg/metric_attrs.h | 28 +++++ lib/pcg/include/pcg/strided_rectangle.h | 17 --- lib/pcg/src/pcg/computation_graph_builder.cc | 2 +- lib/pcg/src/pcg/metric.cc | 8 +- .../parallel_computation_graph_builder.cc | 2 +- lib/pcg/src/pcg/strided_rectangle_side.cc | 17 --- lib/pcg/src/strided_rectangle.cc | 35 ------ lib/pcg/test/src/test_machine_view.cc | 74 ----------- lib/pcg/test/src/test_strided_rectangle.cc | 37 ------ 38 files changed, 263 insertions(+), 436 deletions(-) delete mode 100644 lib/op-attrs/include/op-attrs/make_datatype_value.h rename lib/op-attrs/src/op-attrs/{make_datatype_value.cc => datatype_value.cc} (92%) create mode 100644 lib/op-attrs/test/src/op-attrs/datatype_value.cc create mode 100644 lib/pcg/include/pcg/metric.enum.toml delete mode 100644 lib/pcg/include/pcg/metric.h create mode 100644 lib/pcg/include/pcg/metric_attrs.h delete mode 100644 lib/pcg/include/pcg/strided_rectangle.h delete mode 100644 lib/pcg/src/pcg/strided_rectangle_side.cc delete mode 100644 lib/pcg/src/strided_rectangle.cc delete mode 100644 lib/pcg/test/src/test_machine_view.cc delete mode 100644 lib/pcg/test/src/test_strided_rectangle.cc diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 487bc1f8f0..a6fc4129e0 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -11,6 +11,28 @@ namespace FlexFlow { +inline int calculate_accessor_offset(std::vector const &indices, + ArrayShape const &shape) { + int offset = 0; + int multiplier = 1; + + for (int i = 0; i < shape.num_dims(); i++) { + if (indices.at(i) >= shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= shape.at(legion_dim_t{i}); + } + + return offset; +} + class GenericTensorAccessorR { public: template @@ -57,23 +79,7 @@ class GenericTensorAccessorR { using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - - int offset = 0; - int multiplier = 1; - for (int i = 0; i < this->shape.num_dims(); i++) { - if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices.at(i), - this->shape.at(legion_dim_t{i}))); - } - - offset += indices.at(i) * multiplier; - multiplier *= this->shape.at(legion_dim_t{i}); - } - + int offset = calculate_accessor_offset(indices, this->shape); return data_ptr[offset]; } @@ -141,24 +147,8 @@ class GenericTensorAccessorW { } using T = real_type_t
; - T *data_ptr = static_cast(this->ptr); - int offset = 0; - int multiplier = 1; - for (int i = 0; i < this->shape.num_dims(); i++) { - if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices.at(i), - this->shape.at(legion_dim_t{i}))); - } - - offset += indices.at(i) * multiplier; - multiplier *= this->shape.at(legion_dim_t{i}); - } - + int offset = calculate_accessor_offset(indices, this->shape); return data_ptr[offset]; } @@ -179,24 +169,8 @@ class GenericTensorAccessorW { } using T = real_type_t
; - T const *data_ptr = static_cast(this->ptr); - int offset = 0; - int multiplier = 1; - for (int i = 0; i < this->shape.num_dims(); i++) { - if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices.at(i), - this->shape.at(legion_dim_t{i}))); - } - - offset += indices.at(i) * multiplier; - multiplier *= this->shape.at(legion_dim_t{i}); - } - + int offset = calculate_accessor_offset(indices, this->shape); return data_ptr[offset]; } diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index d60a1a5157..54839bd7fa 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -10,7 +10,7 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); -void backward_kernel(cudaStream_t stream, +void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, float const *output_grad_ptr, float *input_grad_ptr); diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index 9e0dbd4ba1..bab404f884 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 26d5fb4911..7f103ea560 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -19,6 +19,7 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: void cleanup(); private: diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 035ea574de..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -24,6 +24,7 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: void cleanup(); private: diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index d961ee7503..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -2,20 +2,20 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #include "kernels/perf_metrics.h" -#include "pcg/metric.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, int const *label_ptr, - MetricsAttrs const *me, + MetricsAttrs const &me, int num_effective_samples, int num_classes, PerfMetrics &perf_zc); void update_metrics_label_kernel_wrapper(float const *logit_ptr, float const *label_ptr, - MetricsAttrs const *me, + MetricsAttrs const &me, int num_samples, int num_classes, PerfMetrics &perf_zc); diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index ad0a52efb9..9650859a18 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,7 +67,7 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void 
*output_ptr); -void backward_kernel(cudaStream_t stream, +void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, void const *output_ptr, void const *output_grad_ptr, diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 0250f829ec..112f84c90c 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -16,7 +16,7 @@ #include "device.h" #include "kernels/metrics_kernels.h" #include "kernels/perf_metrics.h" -#include "pcg/metric.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { @@ -163,7 +163,7 @@ __global__ void update_metrics_label_kernel(float const *logits, void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, int const *label_ptr, - MetricsAttrs const *me, + MetricsAttrs const &me, int num_effective_samples, int num_classes, PerfMetrics &perf_zc) { @@ -179,7 +179,7 @@ void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf_cuda, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); checkCUDA(cudaMemcpy( &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); @@ -188,7 +188,7 @@ void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, void update_metrics_label_kernel_wrapper(float const *logit_ptr, float const *label_ptr, - MetricsAttrs const *me, + MetricsAttrs const &me, int num_samples, int num_classes, PerfMetrics &perf_zc) { @@ -201,7 +201,7 @@ void update_metrics_label_kernel_wrapper(float const *logit_ptr, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); checkCUDA(cudaMemcpy( &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp index 7ca3149f2f..06b42d420a 100644 --- a/lib/kernels/src/hip/embedding_kernels.cpp +++ b/lib/kernels/src/hip/embedding_kernels.cpp @@ -364,8 +364,8 @@ struct ForwardKernel { weight.data_type == DataType::FLOAT || weight.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -374,10 +374,11 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -386,9 +387,7 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } @@ -408,8 +407,9 @@ struct BackwardKernel { assert(output.data_type == DataType::HALF || output.data_type == DataType::FLOAT || output.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), + + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + 
hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -418,9 +418,11 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -429,9 +431,7 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 03a3a1ad40..270fad7bb6 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/batch_norm_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 4607171a54..5447b12fc5 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { size_t num_inputs = 2; size_t size_per_input = 10; - ff_dim_t concat_axis = ff_dim_t{1}; + ff_dim_t concat_axis = ff_dim_t{nonnegative_int{1}}; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 0bb69aa1dc..bbeb349ced 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/flat_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 7d7298f83d..80a046fe37 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/layer_norm_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index de3e5b72b1..d081a0b07c 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ManagedPerDeviceFFHandle") { - ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); SUBCASE("constructor") { @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("move assignment operator") { SUBCASE("move assign to other") { - ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; new_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() 
== nullptr); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index e88c811803..25264b7a58 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/partition_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 00fa968235..eb0702a970 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/pool_2d_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 1c389cb20d..a33748c0de 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/reduction_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 4adf79847a..c06919d603 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/reverse_kernels.h" #include "kernels/reverse_kernels_cpu.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 34993fa151..e94d102b71 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,6 +1,6 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" #include "utils/containers/repeat.h" diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 0bc85cb8e0..f87fb67921 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -7,7 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { std::size_t num_dims = 2; - std::vector perm = {ff_dim_t{0}, ff_dim_t{1}}; + std::vector perm = {ff_dim_t{nonnegative_int{0}}, + ff_dim_t{nonnegative_int{1}}}; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index bfed1241ba..c75abd50ff 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,5 +1,6 @@ #include "test_utils.h" #include "op-attrs/tensor_shape.h" +#include "utils/join_strings.h" #include namespace FlexFlow { @@ -140,21 +141,16 @@ template struct Print2DCPUAccessorR { void operator()(GenericTensorAccessorR const &accessor, std::ostream &stream) { - using T = real_type_t
<DT>;
-
-    T const *data_ptr = accessor.get<DT>();
     int rows = accessor.shape.at(legion_dim_t{0});
     int cols = accessor.shape.at(legion_dim_t{1});
-    for (int i = 0; i < rows; i++) {
-      for (int j = 0; j < cols; j++) {
-        stream << data_ptr[i * cols + j];
+    std::vector<int> indices(cols);
+    std::iota(indices.begin(), indices.end(), 0);
-        if (j < cols - 1) {
-          stream << " ";
-        }
-      }
-      stream << std::endl;
+    for (int i = 0; i < rows; i++) {
+      stream << join_strings(indices, " ", [&](int k) {
+        return accessor.at<DT>
({i, k}); + }) << std::endl; } } }; diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 19599d2900..a41bfc3aff 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -15,7 +15,7 @@ #include #include -using namespace FlexFlow; +using namespace ::FlexFlow; GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator); diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 512c1ef33b..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -13,71 +13,70 @@ // TEST_CASE("Local Cost Estimator") { // // local backing initialization // ManagedPerDeviceFFHandle managed_handle{ -/*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true -} -; +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// /*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = 
throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 2c524c120a..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -13,5 +13,3 @@ name = "SUM" [[values]] name = "AVG" -[[values]] -name = "NONE" diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h deleted file mode 100644 index af4792dd9e..0000000000 --- a/lib/op-attrs/include/op-attrs/make_datatype_value.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H -#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H - -#include "op-attrs/datatype_value.dtg.h" - -namespace FlexFlow { - -DataTypeValue make_float_data_type_value(float value); -DataTypeValue make_double_data_type_value(double value); -DataTypeValue make_int32_data_type_value(int32_t value); -DataTypeValue make_int64_data_type_value(int64_t value); -DataTypeValue make_bool_data_type_value(bool value); - -} // namespace FlexFlow - -#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc similarity index 92% rename from lib/op-attrs/src/op-attrs/make_datatype_value.cc rename to lib/op-attrs/src/op-attrs/datatype_value.cc index 76d712949a..4604ef0b4e 100644 --- a/lib/op-attrs/src/op-attrs/make_datatype_value.cc +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc @@ -1,4 +1,4 @@ -#include "op-attrs/make_datatype_value.h" +#include "op-attrs/datatype_value.h" namespace FlexFlow { diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..9b0e90b601 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc @@ -0,0 +1,68 @@ +#include "op-attrs/datatype_value.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("test make_data_type_value") { + SUBCASE("make_float_data_type_value") { + float value = 1.0f; + DataTypeValue data_type_value = make_float_data_type_value(value); + + 
CHECK(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<float>() == value);
+    }
+
+    SUBCASE("make_double_data_type_value") {
+      double value = 2.71828;
+      DataTypeValue data_type_value = make_double_data_type_value(value);
+
+      CHECK(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<double>() == value);
+    }
+
+    SUBCASE("make_int32_data_type_value") {
+      int32_t value = -42;
+      DataTypeValue data_type_value = make_int32_data_type_value(value);
+
+      CHECK(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int32_t>() == value);
+    }
+
+    SUBCASE("make_int64_data_type_value") {
+      int64_t value = 1LL << 40;
+      DataTypeValue data_type_value = make_int64_data_type_value(value);
+
+      CHECK(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int64_t>() == value);
+    }
+
+    SUBCASE("make_bool_data_type_value") {
+      bool value = true;
+      DataTypeValue data_type_value = make_bool_data_type_value(value);
+
+      CHECK(data_type_value.has<bool>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK(data_type_value.get<bool>() == value);
+    }
+  }
+}
diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml
new file mode 100644
index 0000000000..ebb2323203
--- /dev/null
+++ b/lib/pcg/include/pcg/metric.enum.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "Metric"
+features = [
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+[[values]]
+name = "ACCURACY"
+
+[[values]]
+name = "CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "SPARSE_CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "ROOT_MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "MEAN_ABSOLUTE_ERROR"
diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h
deleted file mode 100644
index 718919112f..0000000000
--- a/lib/pcg/include/pcg/metric.h
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef _FF_METRICS_H_
-#define _FF_METRICS_H_
-
-#include "op-attrs/ops/loss_functions/loss_functions.h"
-#include "utils/fmt.h"
-#include <vector>
-
-namespace FlexFlow {
-
-enum class Metric {
-  ACCURACY,
-  CATEGORICAL_CROSSENTROPY,
-  SPARSE_CATEGORICAL_CROSSENTROPY,
-  MEAN_SQUARED_ERROR,
-  ROOT_MEAN_SQUARED_ERROR,
-  MEAN_ABSOLUTE_ERROR,
-};
-
-class MetricsAttrs {
-public:
-  MetricsAttrs() = delete;
-  MetricsAttrs(LossFunction, std::vector<Metric> const &);
-
-public:
-  LossFunction loss_type;
-  bool measure_accuracy;
-  bool measure_categorical_crossentropy;
-  bool measure_sparse_categorical_crossentropy;
-  bool measure_mean_squared_error;
-  bool measure_root_mean_squared_error;
-  bool measure_mean_absolute_error;
-};
-
-} // namespace FlexFlow
-
-namespace fmt {
-
-template <>
-struct formatter<::FlexFlow::Metric> : formatter<string_view> {
-  template <typename FormatContext>
-  auto format(::FlexFlow::Metric m, FormatContext &ctx) const
-      -> decltype(ctx.out()) {
-    using namespace FlexFlow;
-
-    string_view name = "unknown";
-    switch (m) {
-      case Metric::ACCURACY:
-        name = "Accuracy";
-        break;
-      case Metric::CATEGORICAL_CROSSENTROPY:
-        name = "CategoricalCrossEntropy";
-        break;
-      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-        name = "SparseCategoricalCrossEntropy";
-        break;
-      case Metric::MEAN_SQUARED_ERROR:
-        name = "MeanSquaredError";
-        break;
-      case Metric::ROOT_MEAN_SQUARED_ERROR:
-        name = "RootMeanSquaredError";
-        break;
-      case Metric::MEAN_ABSOLUTE_ERROR:
-        name = "MeanAbsoluteError";
-        break;
-    }
-    return formatter<string_view>::format(name, ctx);
-  }
-};
-
-} // namespace fmt
-
-#endif
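The replacement header that follows switches the MetricsAttrs constructor from std::vector<Metric> to std::unordered_set<Metric>, so duplicate metric requests collapse instead of redundantly re-setting the same flag. A minimal caller-side sketch of the new signature (the indented block and the specific LossFunction value are illustrative, not taken from the patch):

    #include "pcg/metric_attrs.h"

    using namespace ::FlexFlow;

    MetricsAttrs make_example_metrics_attrs() {
      // Each requested Metric just flips the corresponding measure_* flag in
      // the constructor, so set semantics (unique membership, no ordering)
      // fit better than a vector here.
      std::unordered_set<Metric> metrics = {Metric::ACCURACY,
                                            Metric::MEAN_SQUARED_ERROR};
      return MetricsAttrs{LossFunction::CATEGORICAL_CROSSENTROPY, metrics};
    }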
diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h
new file mode 100644
index 0000000000..343c2154dd
--- /dev/null
+++ b/lib/pcg/include/pcg/metric_attrs.h
@@ -0,0 +1,28 @@
+#ifndef _FF_METRICS_H_
+#define _FF_METRICS_H_
+
+#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "pcg/metric.dtg.h"
+#include "utils/fmt.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+class MetricsAttrs {
+public:
+  MetricsAttrs() = delete;
+  MetricsAttrs(LossFunction, std::unordered_set<Metric> const &);
+
+public:
+  LossFunction loss_type;
+  bool measure_accuracy;
+  bool measure_categorical_crossentropy;
+  bool measure_sparse_categorical_crossentropy;
+  bool measure_mean_squared_error;
+  bool measure_root_mean_squared_error;
+  bool measure_mean_absolute_error;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/pcg/include/pcg/strided_rectangle.h b/lib/pcg/include/pcg/strided_rectangle.h
deleted file mode 100644
index 9c3b8eeda9..0000000000
--- a/lib/pcg/include/pcg/strided_rectangle.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _FLEXFLOW_PCG_INCLUDE_PCG_STRIDED_RECTANGLE_H
-#define _FLEXFLOW_PCG_INCLUDE_PCG_STRIDED_RECTANGLE_H
-
-#include "op-attrs/ff_dim.dtg.h"
-#include "pcg/side_size_t.dtg.h"
-#include "pcg/strided_rectangle.dtg.h"
-
-namespace FlexFlow {
-
-size_t get_num_dims(StridedRectangle const &);
-StridedRectangleSide get_side_at_idx(StridedRectangle const &rect,
-                                     ff_dim_t const &idx);
-num_points_t get_num_points(StridedRectangle const &rect);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc
index 7ff5bec2f7..09772fa9d9 100644
--- a/lib/pcg/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/computation_graph_builder.cc
@@ -1,9 +1,9 @@
 #include "pcg/computation_graph_builder.h"
 #include "op-attrs/computation_graph_op_attrs.h"
+#include "op-attrs/datatype_value.h"
 #include "op-attrs/get_incoming_tensor_roles.h"
 #include "op-attrs/get_op_type.h"
 #include "op-attrs/get_output_shapes.h"
-#include "op-attrs/make_datatype_value.h"
 #include "op-attrs/ops/attention.h"
 #include "op-attrs/ops/batch_norm.h"
 #include "op-attrs/ops/broadcast.h"
diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc
index 69aba90d12..9a93e75350 100644
--- a/lib/pcg/src/pcg/metric.cc
+++ b/lib/pcg/src/pcg/metric.cc
@@ -1,8 +1,8 @@
-#include "pcg/metric.h"
+#include "pcg/metric_attrs.h"
 namespace FlexFlow {
 MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
+                           std::unordered_set<Metric> const &metrics)
     : loss_type(_loss_type), measure_accuracy(false),
       measure_categorical_crossentropy(false),
       measure_sparse_categorical_crossentropy(false),
@@ -29,8 +29,8 @@ MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
       measure_mean_absolute_error = true;
       continue;
     default:
-      throw mk_runtime_error(
-          "Initializing MetricsAttrs with unrecogonized metrics type");
+      throw mk_runtime_error(fmt::format(
+          "Initializing MetricsAttrs with unrecognized metrics type {}", m));
    }
  }
}
diff --git
a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index 79ac43ae66..e2f4555328 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -1,6 +1,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "op-attrs/datatype_value.h" #include "op-attrs/get_incoming_tensor_roles.h" -#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/ops/batch_norm.h" diff --git a/lib/pcg/src/pcg/strided_rectangle_side.cc b/lib/pcg/src/pcg/strided_rectangle_side.cc deleted file mode 100644 index e6caf4cb86..0000000000 --- a/lib/pcg/src/pcg/strided_rectangle_side.cc +++ /dev/null @@ -1,17 +0,0 @@ -#include "pcg/strided_rectangle_side.h" -#include "utils/exception.h" - -namespace FlexFlow { - -StridedRectangleSide strided_side_from_size_and_stride(side_size_t side_size, - int stride) { - assert((side_size.unwrapped % stride) == 0); - return StridedRectangleSide{num_points_t{side_size.unwrapped / stride}, - stride}; -} - -side_size_t get_side_size(StridedRectangleSide const &s) { - return side_size_t{s.num_points.unwrapped * s.stride}; -} - -} // namespace FlexFlow diff --git a/lib/pcg/src/strided_rectangle.cc b/lib/pcg/src/strided_rectangle.cc deleted file mode 100644 index 1c61424ab9..0000000000 --- a/lib/pcg/src/strided_rectangle.cc +++ /dev/null @@ -1,35 +0,0 @@ -#include "pcg/strided_rectangle.h" -#include "op-attrs/dim_ordered/transform.h" -#include "utils/containers.h" - -namespace FlexFlow { - -/* size_t StridedRectangle::at(FFOrdered const &coord) const { */ -/* assert(coord.size() == this->num_dims()); */ - -/* size_t _1d_stride = 1; */ -/* size_t idx = 0; */ -/* for (auto dim : inner_to_outer_idxs(this->sides)) { */ -/* idx += this->sides.at(dim).at(coord.at(dim)).value() * _1d_stride; */ -/* _1d_stride *= this->sides.at(dim).get_size().value(); */ -/* } */ -/* return idx; */ -/* } */ - -size_t get_num_dims(StridedRectangle const &rect) { - return rect.sides.size(); -} - -num_points_t get_num_points(StridedRectangle const &rect) { - return num_points_t{ - product(transform(rect.sides, [](StridedRectangleSide const &side) { - return side.num_points.unwrapped; - }))}; -} - -StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, - ff_dim_t const &idx) { - return rect.sides.at(idx); -} - -} // namespace FlexFlow diff --git a/lib/pcg/test/src/test_machine_view.cc b/lib/pcg/test/src/test_machine_view.cc deleted file mode 100644 index 92a96d5e9a..0000000000 --- a/lib/pcg/test/src/test_machine_view.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include "doctest/doctest.h" -#include "pcg/machine_view.h" -#include "pcg/strided_rectangle.h" -#include "pcg/strided_rectangle_side.h" - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("MachineView general util functions") { - StridedRectangle rect{{StridedRectangleSide{num_points_t{7}, 5}, - StridedRectangleSide{num_points_t{10}, 2}}}; - gpu_id_t start(1); - MachineView mv{device_id_t{start}, rect}; - SUBCASE("num_dims") { - CHECK(num_dims(mv) == 2); - } - SUBCASE("num_devices") { - CHECK(num_devices(mv) == 7 * 10); - } - SUBCASE("get_device_type") { - CHECK(get_device_type(mv) == DeviceType::GPU); - } - } - - TEST_CASE("MachineView make_1d_machine_view - GPU") { - StridedRectangle 
rect{{StridedRectangleSide{num_points_t{7}, 5}}}; - device_id_t start_gpu{gpu_id_t{1}}; - MachineView gpu_mv{start_gpu, rect}; - - SUBCASE("make_1d_machine_view(gpu_id_t start, gpu_id_t stop, int stride)") { - MachineView result = - make_1d_machine_view(start_gpu, device_id_t{gpu_id_t(1 + 7 * 5)}, 5); - MachineView correct = gpu_mv; - CHECK(result == correct); - } - SUBCASE("make_1d_machine_view(gpu_id_t start, num_points_t num_points, int " - "stride)") { - MachineView result = make_1d_machine_view(start_gpu, num_points_t{7}, 5); - MachineView correct = gpu_mv; - CHECK(result == correct); - } - SUBCASE("make_1d_machine_view(gpu_id_t start, side_size_t interval_size, " - "int stride)") { - MachineView result = make_1d_machine_view( - start_gpu, get_side_size(rect.sides.at(ff_dim_t{0})), 5); - MachineView correct = gpu_mv; - CHECK(result == correct); - } - } - - TEST_CASE("MachineView make_1d_machine_view - CPU") { - StridedRectangle rect{{StridedRectangleSide{num_points_t{11}, 4}}}; - device_id_t start_cpu{cpu_id_t{2}}; - MachineView cpu_mv{start_cpu, rect}; - - SUBCASE("make_1d_machine_view(cpu_id_t start, cpu_id_t stop, int stride)") { - MachineView result = - make_1d_machine_view(start_cpu, device_id_t{cpu_id_t(2 + 11 * 4)}, 4); - MachineView correct = cpu_mv; - CHECK(result == correct); - } - SUBCASE("make_1d_machine_view(cpu_id_t start, num_points_t num_points, int " - "stride)") { - MachineView result = make_1d_machine_view(start_cpu, num_points_t{11}, 4); - MachineView correct = cpu_mv; - CHECK(result == correct); - } - SUBCASE("make_1d_machine_view(cpu_id_t start, side_size_t interval_size, " - "int stride)") { - MachineView result = make_1d_machine_view( - start_cpu, get_side_size(rect.sides.at(ff_dim_t{0})), 4); - MachineView correct = cpu_mv; - CHECK(result == correct); - } - } -} diff --git a/lib/pcg/test/src/test_strided_rectangle.cc b/lib/pcg/test/src/test_strided_rectangle.cc deleted file mode 100644 index ef342944de..0000000000 --- a/lib/pcg/test/src/test_strided_rectangle.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "doctest/doctest.h" -#include "pcg/strided_rectangle.h" -#include "pcg/strided_rectangle_side.h" - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("get_side_size(StridedRectangleSide)") { - StridedRectangleSide side{num_points_t{7}, 5}; - - CHECK(get_side_size(side) == side_size_t{7 * 5}); - } - TEST_CASE("strided_side_from_size_and_stride") { - StridedRectangleSide correct{num_points_t{10}, 3}; - StridedRectangleSide result = - strided_side_from_size_and_stride(side_size_t{10 * 3}, 3); - CHECK(result == correct); - } - - TEST_CASE("StridedRectangle - helper functions") { - - StridedRectangleSide s0{num_points_t{7}, 5}; - StridedRectangleSide s1{num_points_t{10}, 2}; - StridedRectangleSide s2{num_points_t{8}, 1}; - StridedRectangle rect{{s0, s1, s2}}; - - SUBCASE("get_num_dims") { - CHECK(get_num_dims(rect) == 3); - } - SUBCASE("get_num_points") { - CHECK(get_num_points(rect) == num_points_t{7 * 8 * 10}); - } - SUBCASE("get_side_at_idx") { - CHECK(get_side_at_idx(rect, ff_dim_t{0}) == s0); - CHECK(get_side_at_idx(rect, ff_dim_t{1}) == s1); - CHECK(get_side_at_idx(rect, ff_dim_t{2}) == s2); - } - } -} From 8860adfc61a17a5bcb23075f90c0661d74589d07 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 28 Jan 2025 18:45:06 -0800 Subject: [PATCH 22/42] build issues --- .envrc | 3 --- .vimrc | 8 -------- lib/kernels/test/src/test_utils.cc | 3 +-- 3 files changed, 1 insertion(+), 13 deletions(-) delete mode 100644 .envrc delete mode 100644 .vimrc diff --git a/.envrc b/.envrc 
deleted file mode 100644 index 2797f0f929..0000000000 --- a/.envrc +++ /dev/null @@ -1,3 +0,0 @@ -source_up_if_exists - -use flake diff --git a/.vimrc b/.vimrc deleted file mode 100644 index 4c8a8a8279..0000000000 --- a/.vimrc +++ /dev/null @@ -1,8 +0,0 @@ -" example search path configuration -set path=lib/runtime/**,lib/** - -" set build target -" let g:target = "pcg" - -" set test target -" let g:test_target = "utils-test" diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index c75abd50ff..a15447446a 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -3,7 +3,7 @@ #include "utils/join_strings.h" #include -namespace FlexFlow { +using namespace ::FlexFlow; GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, Allocator &allocator) { @@ -242,4 +242,3 @@ GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, create_filled_accessor_w(shape, allocator, val); return read_only_accessor_from_write_accessor(w_accessor); } -} // namespace FlexFlow From 7b74acc66b00b9e3380cab3598345660ceb8d5a1 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 29 Jan 2025 19:39:32 -0800 Subject: [PATCH 23/42] Add AWS linux AMI to runs-on for testing (#1589) --- .github/runs-on.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 14f75549dd..b558b5131a 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,4 +1,10 @@ images: + amazon-linux-gpu-x64: + platform: "linux" + arch: "x64" + owner: "898082745236" # AWS + name: "Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver*" + dlami-x64: platform: "linux" arch: "x64" @@ -8,4 +14,4 @@ images: runners: gpu-nvidia: family: ["g4dn.xlarge"] - image: dlami-x64 + image: amazon-linux-gpu-x64 From 8cdc677f2fbaa85d55577c846ed0e644ab47e272 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Thu, 30 Jan 2025 13:57:39 -0800 Subject: [PATCH 24/42] Pin runs-on images (#1590) --- .github/runs-on.yml | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/runs-on.yml b/.github/runs-on.yml index b558b5131a..6312b64955 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,17 +1,12 @@ images: - amazon-linux-gpu-x64: + runs-on-gpu-pinned: platform: "linux" arch: "x64" - owner: "898082745236" # AWS - name: "Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver*" + owner: "135269210855" # runs-on + name: "runs-on-v2.2-ubuntu22-gpu-x64-20250123194414" - dlami-x64: + runs-on-cpu-pinned: platform: "linux" arch: "x64" - owner: "898082745236" # AWS - name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" - -runners: - gpu-nvidia: - family: ["g4dn.xlarge"] - image: amazon-linux-gpu-x64 + owner: "135269210855" # runs-on + name: "runs-on-v2.2-ubuntu22-full-x64-20250101080516" From 209db7ee4434ceb1a2bc700a583bd35d2039aa30 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 31 Jan 2025 00:20:51 -0800 Subject: [PATCH 25/42] GPU CI Fix (Pin runs-on GPU image) (#1588) * Debug * Change to base DL AMI * Print disk usage * Run nvidia-smi * Remove excess cuda installs in base ami * Re-enable freeing space in GPU CI * Try updating nix-develop version * Check what happens if you just enter the non-nixGL environment * Try switching AMIs * Try to remove the module stuff * Move to lockshaw/develop-action * Try pointing at a fixed commit * Update nix-develop action * Update nix-develop action to use BASH_FUNC filtering * Remove all the /usr/local/cuda entries * 
Switch back to gpu-ci env * Update the cuda arch * Try out the new runs-on gpu image * Move over to pinned runs-on image * Remove a bunch more unnecessary stuff in image to get back disk space * Try using an emphemeral store * Try mounting * Fix bug * Try sudo * Move nix into _work * Rollback all unnecessary changes * Re-enable waiting on cpu-ci --- .github/workflows/helpers/free_space_on_runner_gpu.sh | 8 -------- .github/workflows/tests.yml | 9 +++++---- 2 files changed, 5 insertions(+), 12 deletions(-) delete mode 100755 .github/workflows/helpers/free_space_on_runner_gpu.sh diff --git a/.github/workflows/helpers/free_space_on_runner_gpu.sh b/.github/workflows/helpers/free_space_on_runner_gpu.sh deleted file mode 100755 index a382ee58f6..0000000000 --- a/.github/workflows/helpers/free_space_on_runner_gpu.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -euo pipefail -set -x - -sudo rm -rf /usr/share/dotnet -sudo rm -rf /usr/local/lib/android -sudo rm -rf /opt/ghc -sudo rm -rf "/usr/local/share/boost" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7e2dabd784..e2fc0b6df6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: submodules: recursive - name: Free additional space on runner - run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh + run: ./.github/workflows/helpers/free_space_on_runner.sh - name: Install nix uses: cachix/install-nix-action@v25 @@ -67,7 +67,7 @@ jobs: runs-on: - runs-on - family=g4dn.xlarge - - image=ubuntu22-full-x64 + - image=runs-on-gpu-pinned strategy: max-parallel: 1 @@ -79,8 +79,9 @@ jobs: with: submodules: recursive - - name: free additional space on runner - run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh + - name: mount ephemeral drive to nix + run: | + sudo mkdir $HOME/_work/nix && sudo mkdir /nix && sudo mount --bind $HOME/_work/nix /nix - name: install nix uses: cachix/install-nix-action@v25 From 0d2ffdb278a6e0a204e94083d4dfaec5db249200 Mon Sep 17 00:00:00 2001 From: Victor Li <32348970+victorli2002@users.noreply.github.com> Date: Sat, 1 Feb 2025 12:54:42 -0800 Subject: [PATCH 26/42] Merge substitution-builder (#1575) * Start on pcg builder * Add tests and some implementation for pcg builder * Add pcg tests, make dtgen constructors explicit to fix bug * Add remainder of PCG tests * Fix build issues in local-execution * Format * Address Reyna comments, add topological_order function for PCG * Pre multidigraph refactor * Removing visitable from sp code * Add open dataflow graph, start to replace pcg dataflow graph * Start refactoring substitutions * Add utility functions to support pattern matching * Pre-refactor inputs * Fix proj url * Get back to substitutions, now with unordered graph inputs * Get substitutions building * substitutions-tests now builds * Fix bug in filter, pass some initial substitution tests * Add tests for fmt::to_string, fix some substitutions bugs * Pass initial unit tests for find_pattern_matches * Start on unit tests for pcg pattern * Pass initial test for find_pattern_matches * Fix small build issue in tests * Format * Sync tests in CI with tests in proj * Fix minor build errors in kernels and local-execution * Format * Remove outdated code * More outdated code removal * More cleanup, add test for sp decomposition * Pull apart containers.h * More sp testing and fixes * Break up graph algorithms.h * Pre- full SP algo commit * Add initial implementation and tests for cbc decomposition and inverse line graph * Pass test for 
get_inverse_line_graph * Add new multidigraph * Fix get_inverse_line_graph to return a MultiDiGraph instead of a DiGraph * Add tests for parallel and series reduction finding * Add really rough implementation of valdez sp decomposition * Fix local-execution build * Add implementations and tests for applying series/parallel reductions * Format * Clean up sp decomposition interface and tests * Format * Add comments for top-level substitutions functions, add proj doxygen support * Start sketching out substitutions code * Fix build errors * Add ability to permute node ids * Cleanup and start to test new substitutions code * Add test case for evaluate_substitution_output * Add naive isomorphism detection code * Add graph inputs to open dataflow graph isomorphism * Add input permutation to evaluate_substitution_output * Fix permute_node_ids * Add test for permute_input_ids * Migrate over to mutable implementation of apply_substitution * Add fast isomorphism checking and an initial implementation of full substitution logic * Pass initial full substitutions test * Cleanup old isomorphism checking code * Fix post-merge bugs * Fix broken pcg builder test * Format * Reorganize code and remove some outdated code pre-code-review * Format * Restarting work on this after working on export-model-arch * Adding in some a simple function to get the currently available substritutions * nonnegative_int additions, code cleanup, etc. * A bunch more moving over to nonnegative_int * Even more nonnegative_int updating * Fix build * Fix failing tests * Format * Format --------- Co-authored-by: Colin Unger Co-authored-by: Victor Li --- .../src/export_model_arch.cc | 17 +- cmake/flexflow-utils.cmake | 14 +- flake.nix | 14 +- ...omputation_graph_binary_sp_decomposition.h | 5 +- .../src/compiler/allowed_machine_views.cc | 41 +- .../get_machine_resource_splits.cc | 10 +- .../machine_mapping/machine_mapping.cc | 10 +- ...el_layer_guid_oblivious_machine_mapping.cc | 4 +- ...mputation_graph_binary_sp_decomposition.cc | 2 +- .../test/src/allowed_machine_views.cc | 60 +- ...racted_tensor_set_movement_across_split.cc | 8 +- .../get_machine_resource_splits.cc | 193 ++--- .../get_optimal_machine_mapping.cc | 28 +- .../get_tensor_set_movement_across_split.cc | 32 +- .../machine_mapping/machine_mapping.cc | 24 +- .../get_machine_mapping_problem_tree.cc | 6 +- .../machine_mapping/machine_mapping_result.cc | 36 +- ...get_optimal_machine_mapping_with_memory.cc | 28 +- .../machine_mapping_result_with_memory.cc | 80 +-- ...ion_graph_series_parallel_decomposition.cc | 64 +- .../task_graph_simulator/task_simulator.cc | 88 ++- lib/compiler/test/src/graph_optimize_state.cc | 59 +- lib/kernels/include/kernels/array_shape.h | 36 +- .../include/kernels/batch_norm_kernels.h | 43 +- .../batch_norm_per_device_state.struct.toml | 68 ++ lib/kernels/include/kernels/legion_dim.h | 10 +- .../include/kernels/legion_dim_t.struct.toml | 7 +- .../kernels/per_device_op_state.variant.toml | 5 - .../include/kernels/transpose_kernels.h | 17 +- lib/kernels/src/allocation.cc | 3 +- lib/kernels/src/array_shape.cc | 53 +- lib/kernels/src/cuda/cuda_helper.cu | 8 +- .../src/cuda/ops/batch_norm_kernels.cu | 32 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 4 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 5 +- lib/kernels/src/cuda/ops/concat_kernels.cu | 7 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 16 +- .../src/cuda/ops/element_unary_kernels.cu | 8 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 12 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 54 +- 
lib/kernels/src/cuda/ops/partition_kernels.cu | 16 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 8 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 7 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 99 +-- lib/kernels/src/legion_dim.cc | 9 +- lib/kernels/test/src/test_attention_kernel.cc | 50 +- .../test/src/test_batch_matmul_kernel.cc | 30 +- .../test/src/test_batch_norm_kernel.cc | 58 +- lib/kernels/test/src/test_cast_kernel.cc | 4 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 8 +- lib/kernels/test/src/test_dropout.cc | 4 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 7 +- .../test/src/test_layer_norm_kernels.cc | 8 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 52 +- lib/kernels/test/src/test_reduction_kernel.cc | 5 +- lib/kernels/test/src/test_replicate_kernel.cc | 12 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 51 +- lib/kernels/test/src/test_softmax_kernel.cc | 19 +- lib/kernels/test/src/test_split_kernel.cc | 12 +- lib/kernels/test/src/test_transpose_kernel.cc | 19 +- ...device_specific_device_states.variant.toml | 6 +- .../local-execution/legion_tensor_shape.h | 40 -- .../local-execution/task_id_t.enum.toml | 3 - .../src/legion_tensor_shape.cc | 15 - lib/local-execution/src/ops/attention.cc | 55 +- lib/local-execution/src/ops/batch_matmul.cc | 73 +- lib/local-execution/src/ops/batch_matmul.h | 2 +- lib/local-execution/src/ops/batch_norm.cc | 27 +- lib/local-execution/src/ops/conv_2d.cc | 26 +- lib/local-execution/src/ops/gather.cc | 9 +- lib/local-execution/src/ops/layer_norm.cc | 24 +- lib/local-execution/src/ops/linear.cc | 49 +- lib/local-execution/src/ops/pool_2d.cc | 91 ++- lib/local-execution/src/ops/reduce.cc | 9 +- lib/local-execution/src/ops/reduction.cc | 4 +- lib/local-execution/src/ops/replicate.cc | 4 +- lib/local-execution/src/ops/reverse.cc | 46 +- lib/local-execution/src/ops/softmax.cc | 20 +- lib/local-execution/src/ops/split.cc | 51 +- lib/local-execution/src/ops/topk.cc | 20 +- lib/local-execution/src/ops/transpose.cc | 55 +- lib/local-execution/src/ops/transpose.h | 3 - .../src/task_signature_impl.cc | 4 - .../test/src/test_local_slots_backing.cc | 13 +- .../test/src/test_local_task_arg_accessor.cc | 13 +- .../test/src/test_task_registry.cc | 8 +- .../models/bert/bert_config.struct.toml | 15 +- .../candle_uno/candle_uno_config.struct.toml | 9 +- .../inception_v3_config.struct.toml | 8 +- .../include/models/split_test/split_test.h | 2 +- .../transformer_config.struct.toml | 21 +- lib/models/src/models/bert/bert.cc | 41 +- .../src/models/candle_uno/candle_uno.cc | 57 +- .../src/models/inception_v3/inception_v3.cc | 675 +++++++++--------- .../src/models/split_test/split_test.cc | 16 +- .../src/models/transformer/transformer.cc | 89 +-- .../computation_graph_op_attrs.variant.toml | 2 +- lib/op-attrs/include/op-attrs/datatype.h | 3 +- .../op-attrs/dim_ordered/dim_ordered.h | 27 +- .../include/op-attrs/dim_ordered/slice.h | 4 +- lib/op-attrs/include/op-attrs/get_op_type.h | 2 +- lib/op-attrs/include/op-attrs/ops/attention.h | 36 +- .../multihead_attention_inputs.struct.toml | 12 +- .../op-attrs/ops/attention_attrs.struct.toml | 12 +- .../include/op-attrs/ops/batch_matmul.h | 2 +- .../op-attrs/ops/batch_matmul.struct.toml | 19 - .../ops/batch_matmul_attrs.struct.toml | 30 + 
.../op-attrs/ops/combine_attrs.struct.toml | 3 +- .../conv_2d/conv_2d_input_shape.struct.toml | 9 +- .../conv_2d_parallel_input_shape.struct.toml | 5 +- .../op-attrs/ops/conv_2d_attrs.struct.toml | 17 +- .../op-attrs/ops/embedding_attrs.struct.toml | 7 +- .../op-attrs/ops/linear_attrs.struct.toml | 3 +- lib/op-attrs/include/op-attrs/ops/pool_2d.h | 4 +- .../op-attrs/ops/pool_2d_attrs.struct.toml | 13 +- .../op-attrs/ops/reduction_attrs.struct.toml | 6 +- .../ops/repartition_attrs.struct.toml | 3 +- .../op-attrs/ops/replicate_attrs.struct.toml | 6 +- .../op-attrs/ops/split_attrs.struct.toml | 3 +- .../op-attrs/ops/topk_attrs.struct.toml | 6 +- .../parallel_tensor_dim_degrees.struct.toml | 3 +- .../include/op-attrs/parallel_tensor_dims.h | 20 +- .../include/op-attrs/parallel_tensor_shape.h | 25 +- .../discard_copy_degree.struct.toml | 6 +- .../sum_degree.struct.toml | 6 +- .../op-attrs/pcg_operator_attrs.variant.toml | 2 +- .../include/op-attrs/relative_ff_dim_t.h | 2 +- .../op-attrs/replica_parallel_dim.struct.toml | 3 +- .../op-attrs/replica_parallel_dim_set.h | 3 +- .../op-attrs/shard_parallel_dim.struct.toml | 8 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 8 +- .../include/op-attrs/tensor_dims.struct.toml | 4 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 10 +- lib/op-attrs/src/op-attrs/datatype.cc | 15 +- lib/op-attrs/src/op-attrs/ff_dim_t.cc | 2 +- lib/op-attrs/src/op-attrs/ops/attention.cc | 474 ++---------- .../attention/multihead_attention_inputs.cc | 18 +- .../multihead_attention_parallel_inputs.cc | 6 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 21 +- lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 27 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 15 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 65 +- .../ops/conv_2d/conv_2d_input_shape.cc | 8 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 24 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 18 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 20 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 73 +- .../src/op-attrs/parallel_tensor_dims.cc | 39 +- .../src/op-attrs/parallel_tensor_shape.cc | 64 +- .../src/op-attrs/relative_ff_dim_t.cc | 4 +- .../src/op-attrs/replica_parallel_dim_set.cc | 6 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 15 +- lib/op-attrs/src/op-attrs/tensor_shape.cc | 13 +- .../test/src/op-attrs/ops/attention.cc | 178 +++-- .../test/src/op-attrs/ops/batch_matmul.cc | 144 ++-- .../test/src/op-attrs/ops/batch_norm.cc | 84 +-- lib/op-attrs/test/src/op-attrs/ops/cast.cc | 34 +- lib/op-attrs/test/src/op-attrs/ops/combine.cc | 20 +- lib/op-attrs/test/src/op-attrs/ops/concat.cc | 176 ++--- lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc | 168 ++--- lib/op-attrs/test/src/op-attrs/ops/dropout.cc | 62 +- .../test/src/op-attrs/ops/element_binary.cc | 70 +- .../test/src/op-attrs/ops/element_unary.cc | 38 +- .../test/src/op-attrs/ops/embedding.cc | 68 +- lib/op-attrs/test/src/op-attrs/ops/flat.cc | 110 +-- .../test/src/op-attrs/ops/layer_norm.cc | 93 +-- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 142 ++-- lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc | 226 +++--- .../test/src/op-attrs/ops/reduction.cc | 16 +- .../test/src/op-attrs/ops/repartition.cc | 16 +- .../test/src/op-attrs/ops/replicate.cc | 17 +- lib/op-attrs/test/src/op-attrs/ops/softmax.cc | 78 +- .../test/src/op-attrs/pcg_operator_attrs.cc | 4 +- .../test/src/op-attrs/relative_ff_dim_t.cc | 10 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 31 +- .../include/pcg/computation_graph_builder.h | 91 +-- 
lib/pcg/include/pcg/cpu_id_t.struct.toml | 6 +- lib/pcg/include/pcg/device_id.h | 4 +- .../file_format/v1/graphs/v1_dataflow_graph.h | 2 +- .../v1/graphs/v1_dataflow_graph.struct.toml | 3 +- .../v1/graphs/v1_graph_edge.struct.toml | 12 +- .../v1/graphs/v1_labelled_dataflow_graph.h | 9 +- .../v1_labelled_dataflow_graph.struct.toml | 5 +- .../v1_binary_sp_decomposition.variant.toml | 3 +- .../pcg/file_format/v1/v1_computation_graph.h | 2 +- lib/pcg/include/pcg/gpu_id_t.struct.toml | 6 +- .../pcg/machine_space_coordinate.struct.toml | 5 +- lib/pcg/include/pcg/machine_specification.h | 12 +- .../pcg/machine_specification.struct.toml | 10 +- lib/pcg/include/pcg/machine_view.h | 2 +- lib/pcg/include/pcg/operator_task_space.h | 4 +- .../pcg/operator_task_space.struct.toml | 3 +- .../parallel_computation_graph_builder.h | 38 +- .../parallel_computation_graph_edge.h | 2 +- .../pcg/start_invariant_machine_view.h | 2 +- lib/pcg/include/pcg/stride_t.struct.toml | 6 +- .../pcg/task_space_coordinate.struct.toml | 3 +- lib/pcg/src/pcg/computation_graph_builder.cc | 137 ++-- lib/pcg/src/pcg/device_id.cc | 4 +- .../v1/graphs/v1_dataflow_graph.cc | 6 +- .../v1/graphs/v1_labelled_dataflow_graph.cc | 16 + .../v1/v1_binary_sp_decomposition/json.cc | 4 +- .../file_format/v1/v1_computation_graph.cc | 7 +- lib/pcg/src/pcg/machine_space_offset.cc | 6 +- lib/pcg/src/pcg/machine_specification.cc | 17 +- lib/pcg/src/pcg/machine_view.cc | 79 +- lib/pcg/src/pcg/operator_task_space.cc | 22 +- .../generate_weight_transform.cc | 4 +- .../parallel_computation_graph_builder.cc | 98 +-- .../parallel_computation_graph_edge.cc | 2 +- .../src/pcg/start_invariant_machine_view.cc | 7 +- lib/pcg/test/src/pcg/computation_graph.cc | 40 +- .../test/src/pcg/computation_graph_builder.cc | 18 +- .../v1/v1_binary_sp_decomposition/json.cc | 18 +- .../file_format/v1/v1_computation_graph.cc | 8 +- .../v1/v1_parallel_computation_graph.cc | 10 +- lib/pcg/test/src/pcg/machine_specification.cc | 17 +- lib/pcg/test/src/pcg/machine_view.cc | 160 ++--- lib/pcg/test/src/pcg/operator_task_space.cc | 28 +- .../parallel_computation_graph.cc | 38 +- .../parallel_computation_graph_builder.cc | 155 ++-- .../src/pcg/start_invariant_machine_view.cc | 56 +- .../apply_substitution/apply_substitution.h | 31 + .../evaluate_substitution_output.h | 6 +- .../output_expr_to_result_sub_pcg_mapping.h | 6 +- ...expr_to_result_sub_pcg_mapping.struct.toml | 0 .../perform_shape_inference.h | 4 +- .../substitutions/constraint_type.enum.toml | 3 + .../operator_pattern/get_attribute_map.h | 15 + .../operator_attribute_constraint.h | 2 + .../operator_attribute_key.enum.toml | 1 + .../operator_pattern/operator_attribute_key.h | 12 + ...operator_attribute_list_access.struct.toml | 5 +- .../operator_attribute_value.variant.toml | 14 +- .../output_graph/output_graph_expr.h | 5 + .../output_graph/output_graph_expr_value.h | 16 + .../output_graph_expr_value.variant.toml | 19 + .../output_operator_attrs_assignment.h | 3 + ...tput_operator_attrs_assignment.struct.toml | 7 +- .../include/substitutions/pcg_pattern.h | 2 + .../include/substitutions/pcg_pattern_match.h | 4 +- .../sub_parallel_computation_graph_edge.h | 2 +- .../include/substitutions/substitution.h | 25 +- .../substitutions/substitution_builder.h | 49 ++ .../tensor_attribute_list_access.struct.toml | 5 +- .../tensor_pattern/tensor_attribute_pattern.h | 3 + .../tensor_attribute_value.variant.toml | 5 +- .../substitutions/unity_substitution_set.h | 47 ++ .../unlabelled/input_pattern_edge.h | 2 +- 
.../unlabelled/pattern_matching.h | 10 +- .../unlabelled/pattern_node_output.h | 2 +- .../unlabelled/standard_pattern_edge.h | 4 +- .../apply_substitution/apply_substitution.cc | 165 +++++ .../evaluate_substitution_output.cc | 4 +- .../output_expr_to_result_sub_pcg_mapping.cc | 2 +- .../perform_shape_inference.cc | 2 +- .../operator_pattern/eval_list_access.cc | 21 +- .../operator_pattern/eval_list_size.cc | 5 +- .../operator_pattern/get_attribute.cc | 156 ++-- .../operator_pattern/get_attribute_map.cc | 25 + .../operator_attribute_constraint.cc | 10 + .../operator_attribute_key.cc | 68 ++ .../materialize_operator_from_attrs_map.cc | 27 +- .../output_graph/output_graph_expr.cc | 18 + .../output_graph/output_graph_expr_value.cc | 30 + .../output_operator_attrs_assignment.cc | 41 +- .../src/substitutions/pcg_pattern.cc | 18 + .../sub_parallel_computation_graph.cc | 55 +- .../sub_parallel_computation_graph_edge.cc | 2 +- .../src/substitutions/substitution.cc | 301 ++++---- .../src/substitutions/substitution_builder.cc | 162 +++++ .../tensor_pattern/eval_list_access.cc | 5 +- .../tensor_pattern/eval_list_size.cc | 5 +- .../tensor_pattern/get_attribute.cc | 10 +- .../tensor_attribute_pattern.cc | 16 + .../substitutions/unity_substitution_set.cc | 235 ++++++ .../unlabelled/input_pattern_edge.cc | 2 +- .../unlabelled/pattern_node_output.cc | 2 +- .../unlabelled/standard_pattern_edge.cc | 4 +- .../apply_substitution/apply_substitution.cc | 174 +++++ .../evaluate_substitution_output.cc | 63 +- .../perform_shape_inference.cc | 63 +- .../operator_pattern/get_attribute.cc | 2 +- .../test/src/substitutions/pcg_pattern.cc | 14 +- .../test/src/substitutions/substitution.cc | 345 ++++----- .../src/substitutions/substitution_builder.cc | 145 ++++ .../substitutions/unity_substitution_set.cc | 20 + .../unlabelled/find_pattern_matches.cc} | 29 +- .../unlabelled/pattern_matching.cc | 210 ++++++ .../substitutions/unlabelled/pattern_split.cc | 8 +- .../unlabelled/unlabelled_graph_pattern.cc | 4 +- .../test/src/test_substitution.cc | 148 ---- .../algorithms/bidict_from_enumerating.h | 14 +- .../utils/cli/cli_flag_key.struct.toml | 6 +- .../cli_positional_argument_key.struct.toml | 6 +- lib/utils/include/utils/containers/at_idx.h | 5 +- .../include/utils/containers/enumerate.h | 16 +- .../utils/containers/enumerate_vector.h | 11 +- lib/utils/include/utils/containers/flatmap.h | 15 +- .../get_all_permutations_with_repetition.h | 10 +- lib/utils/include/utils/containers/make.h | 13 + .../include/utils/containers/merge_maps.h | 60 +- .../utils/containers/merge_method.enum.toml | 17 + lib/utils/include/utils/containers/product.h | 2 +- lib/utils/include/utils/containers/repeat.h | 5 +- .../include/utils/containers/repeat_element.h | 22 + .../include/utils/containers/replicate.h | 15 - lib/utils/include/utils/containers/sum.h | 2 +- .../algorithms/view_as_open_dataflow_graph.h | 34 + .../dataflow_edge_query.struct.toml | 5 +- .../graph/dataflow_graph/dataflow_graph.h | 3 +- .../dataflow_graph/dataflow_input.struct.toml | 3 +- .../dataflow_output.struct.toml | 3 +- .../dataflow_output_query.struct.toml | 6 +- .../graph/dataflow_graph/i_dataflow_graph.h | 2 +- .../instances/unordered_set_dataflow_graph.h | 4 +- ...ordered_set_labelled_open_dataflow_graph.h | 7 +- .../algorithms/get_graph_data.h | 1 + .../algorithms/permute_node_ids.h | 1 + .../graph/multidigraph/algorithms/add_nodes.h | 3 +- .../algorithms/are_isomorphic.h | 13 + .../open_dataflow_graph_isomorphism.h | 21 + .../dataflow_input_edge_query.struct.toml | 3 
+- .../i_open_dataflow_graph.h | 2 +- .../open_dataflow_graph/open_dataflow_edge.h | 2 +- .../open_dataflow_graph/open_dataflow_graph.h | 2 +- .../unordered_set_open_dataflow_graph.h | 2 +- lib/utils/include/utils/graph/render_dot.h | 19 + .../include/utils/nonnegative_int/ceildiv.h | 11 + .../utils/nonnegative_int/nonnegative_int.h | 28 +- .../utils/nonnegative_int/nonnegative_range.h | 14 + .../utils/nonnegative_int/num_elements.h | 17 + lib/utils/include/utils/variant.h | 1 + .../algorithms/bidict_from_enumerating.cc | 13 + lib/utils/src/utils/cli/cli_parse.cc | 6 +- lib/utils/src/utils/cli/cli_spec.cc | 15 +- lib/utils/src/utils/containers/at_idx.cc | 9 + lib/utils/src/utils/containers/enumerate.cc | 11 + .../src/utils/containers/enumerate_vector.cc | 9 + lib/utils/src/utils/containers/make.cc | 8 + lib/utils/src/utils/containers/range.cc | 1 + lib/utils/src/utils/containers/repeat.cc | 10 + .../src/utils/containers/repeat_element.cc | 10 + lib/utils/src/utils/containers/replicate.cc | 1 - .../utils/graph/dataflow_graph/algorithms.cc | 2 +- .../graph/dataflow_graph/algorithms/as_dot.cc | 41 +- .../get_dataflow_edges_from_node_to_node.cc | 4 +- .../algorithms/get_incoming_edges.cc | 8 +- .../algorithms/get_outgoing_edges.cc | 8 +- .../algorithms/get_subgraph_incoming_edges.cc | 4 +- .../algorithms/get_subgraph_outgoing_edges.cc | 4 +- .../algorithms/view_as_open_dataflow_graph.cc | 22 +- .../algorithms/view_as_open_dataflow_graph.h | 32 - .../dataflow_graph/dataflow_edge_query.cc | 20 +- .../graph/dataflow_graph/dataflow_graph.cc | 2 +- .../dataflow_graph/dataflow_output_query.cc | 6 +- .../dataflow_graph/i_dataflow_graph_view.cc | 4 +- .../digraph/algorithms/transitive_closure.cc | 5 +- .../algorithms/transitive_reduction.cc | 4 +- .../instances/unordered_set_dataflow_graph.cc | 10 +- .../multidigraph/algorithms/add_nodes.cc | 2 +- .../algorithms/are_isomorphic.cc | 11 + .../open_dataflow_graph/algorithms/as_dot.cc | 11 +- .../algorithms/get_incoming_edges.cc | 6 +- .../algorithms/get_subgraph_incoming_edges.cc | 6 +- .../open_dataflow_graph_isomorphism.cc | 54 ++ .../dataflow_input_edge_query.cc | 10 +- .../open_dataflow_graph/open_dataflow_edge.cc | 2 +- .../open_dataflow_graph.cc | 2 +- .../unordered_set_open_dataflow_graph.cc | 2 +- lib/utils/src/utils/graph/render_dot.cc | 90 +++ .../src/utils/nonnegative_int/ceildiv.cc | 20 + .../utils/nonnegative_int/nonnegative_int.cc | 79 +- .../nonnegative_int/nonnegative_range.cc | 19 + .../src/utils/nonnegative_int/num_elements.cc | 10 + lib/utils/test/src/main.cc | 2 - .../algorithms/bidict_from_enumerating.cc | 19 +- lib/utils/test/src/utils/cli/cli_parse.cc | 34 +- lib/utils/test/src/utils/containers/at_idx.cc | 29 + .../test/src/utils/containers/enumerate.cc | 33 +- .../src/utils/containers/enumerate_vector.cc | 33 + .../test/src/utils/containers/flatmap.cc | 32 + .../get_all_permutations_with_repetition.cc | 8 +- lib/utils/test/src/utils/containers/make.cc | 15 + .../test/src/utils/containers/merge_maps.cc | 78 +- .../test/src/utils/containers/product.cc | 20 + lib/utils/test/src/utils/containers/repeat.cc | 2 +- .../{replicate.cc => repeat_element.cc} | 9 +- .../utils/graph/dataflow_graph/algorithms.cc | 14 +- .../dataflow_graphs_are_isomorphic.cc | 24 +- .../algorithms/find_isomorphism.cc | 24 +- .../get_dataflow_edges_from_node_to_node.cc | 26 +- .../algorithms/get_incoming_edges.cc | 14 +- .../algorithms/get_outgoing_edges.cc | 28 +- .../algorithms/get_subgraph_incoming_edges.cc | 14 +- .../algorithms/get_subgraph_outgoing_edges.cc 
| 12 +- ...sitive_reduced_boundary_nodes_for_split.cc | 8 +- ...t_transitive_reduced_edges_across_split.cc | 34 +- ...transitive_reduced_outputs_across_split.cc | 8 +- .../unordered_open_dataflow_graph.cc | 8 +- .../multidigraph/algorithms/add_edges.cc | 2 +- .../multidigraph/algorithms/add_nodes.cc | 2 +- .../multidigraph/algorithms/get_edges.cc | 2 +- .../algorithms/find_isomorphism.cc | 23 +- .../get_open_dataflow_graph_inputs.cc | 2 +- .../get_open_dataflow_value_uses.cc | 20 +- .../get_unused_open_dataflow_graph_inputs.cc | 4 +- .../open_dataflow_graphs_are_isomorphic.cc | 23 +- .../algorithms/permute_input_ids.cc | 14 +- .../algorithms/permute_node_ids.cc | 28 +- .../series_parallel/parallel_reduction.cc | 14 +- .../graph/series_parallel/series_reduction.cc | 18 +- .../test/src/utils/nonnegative_int/ceildiv.cc | 52 ++ .../utils/nonnegative_int/nonnegative_int.cc | 90 ++- .../nonnegative_int/nonnegative_range.cc | 42 ++ .../src/utils/nonnegative_int/num_elements.cc | 15 + lib/utils/test/src/utils/random_utils.cc | 6 +- 423 files changed, 7336 insertions(+), 5040 deletions(-) create mode 100644 lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml delete mode 100644 lib/local-execution/include/local-execution/legion_tensor_shape.h delete mode 100644 lib/local-execution/src/legion_tensor_shape.cc delete mode 100644 lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml create mode 100644 lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.h (76%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.h (62%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.struct.toml (100%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.h (85%) create mode 100644 lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h create mode 100644 lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h create mode 100644 lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h create mode 100644 lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml create mode 100644 lib/substitutions/include/substitutions/substitution_builder.h create mode 100644 lib/substitutions/include/substitutions/unity_substitution_set.h create mode 100644 lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.cc (96%) rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.cc (93%) rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.cc (95%) create mode 100644 lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc create mode 100644 lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc create mode 100644 lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc create mode 100644 
lib/substitutions/src/substitutions/substitution_builder.cc create mode 100644 lib/substitutions/src/substitutions/unity_substitution_set.cc create mode 100644 lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc rename lib/substitutions/test/src/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.cc (86%) rename lib/substitutions/test/src/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.cc (78%) create mode 100644 lib/substitutions/test/src/substitutions/substitution_builder.cc create mode 100644 lib/substitutions/test/src/substitutions/unity_substitution_set.cc rename lib/substitutions/test/src/{test_pattern_matches.cc => substitutions/unlabelled/find_pattern_matches.cc} (94%) create mode 100644 lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc delete mode 100644 lib/substitutions/test/src/test_substitution.cc create mode 100644 lib/utils/include/utils/containers/make.h create mode 100644 lib/utils/include/utils/containers/merge_method.enum.toml create mode 100644 lib/utils/include/utils/containers/repeat_element.h delete mode 100644 lib/utils/include/utils/containers/replicate.h create mode 100644 lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h create mode 100644 lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h create mode 100644 lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h create mode 100644 lib/utils/include/utils/graph/render_dot.h create mode 100644 lib/utils/include/utils/nonnegative_int/ceildiv.h create mode 100644 lib/utils/include/utils/nonnegative_int/nonnegative_range.h create mode 100644 lib/utils/include/utils/nonnegative_int/num_elements.h create mode 100644 lib/utils/src/utils/containers/make.cc create mode 100644 lib/utils/src/utils/containers/repeat_element.cc delete mode 100644 lib/utils/src/utils/containers/replicate.cc delete mode 100644 lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h create mode 100644 lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc create mode 100644 lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc create mode 100644 lib/utils/src/utils/graph/render_dot.cc create mode 100644 lib/utils/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/src/utils/nonnegative_int/nonnegative_range.cc create mode 100644 lib/utils/src/utils/nonnegative_int/num_elements.cc delete mode 100644 lib/utils/test/src/main.cc create mode 100644 lib/utils/test/src/utils/containers/at_idx.cc create mode 100644 lib/utils/test/src/utils/containers/enumerate_vector.cc create mode 100644 lib/utils/test/src/utils/containers/make.cc rename lib/utils/test/src/utils/containers/{replicate.cc => repeat_element.cc} (69%) create mode 100644 lib/utils/test/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc create mode 100644 lib/utils/test/src/utils/nonnegative_int/num_elements.cc diff --git a/bin/export-model-arch/src/export_model_arch.cc b/bin/export-model-arch/src/export_model_arch.cc index 64419acce4..a9f6c65b86 100644 --- a/bin/export-model-arch/src/export_model_arch.cc +++ b/bin/export-model-arch/src/export_model_arch.cc @@ -13,6 +13,7 @@ #include "utils/cli/cli_parse.h" #include "utils/cli/cli_parse_result.h" #include "utils/cli/cli_spec.h" +#include 
"utils/graph/open_dataflow_graph/algorithms/as_dot.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/right_associative_binary_sp_tree_from_nary.h" #include "utils/graph/series_parallel/get_series_parallel_decomposition.h" @@ -21,11 +22,11 @@ using namespace ::FlexFlow; ComputationGraph get_single_operator_computation_graph() { ComputationGraphBuilder b; - size_t batch_size = 8; - size_t in_channels = 16; - size_t out_channels = 12; + nonnegative_int batch_size = 8_n; + nonnegative_int in_channels = 16_n; + nonnegative_int out_channels = 12_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ batch_size, in_channels, out_channels, @@ -69,7 +70,7 @@ tl::expected } else if (model_name == "bert") { return get_bert_computation_graph(get_default_bert_config()); } else if (model_name == "split_test") { - int batch_size = 8; + nonnegative_int batch_size = 8_n; return get_split_test_computation_graph(batch_size); } else if (model_name == "single_operator") { return get_single_operator_computation_graph(); @@ -100,10 +101,10 @@ tl::expected result.value(); }); - std::pair> v1_result = - to_v1_including_node_numbering(computation_graph); + std::pair> + v1_result = to_v1_including_node_numbering(computation_graph); V1ComputationGraph v1_cg = v1_result.first; - bidict layer_numbering = v1_result.second; + bidict layer_numbering = v1_result.second; V1BinarySPDecomposition v1_sp_decomposition = to_v1(sp_decomposition, layer_numbering); diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 7ba39e92c9..515a249521 100644 --- a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -20,6 +20,7 @@ function(define_ff_vars target) MAX_TENSOR_DIM=${FF_MAX_DIM} MAX_NUM_TASK_REGIONS=${FF_MAX_NUM_TASK_REGIONS} MAX_NUM_TASK_ARGUMENTS=${FF_MAX_NUM_TASK_ARGUMENTS} + # _FORTIFY_SOURCE=0 ) if (FF_GPU_BACKEND STREQUAL "cuda") @@ -39,7 +40,18 @@ function(ff_set_cxx_properties target) CXX_EXTENSIONS NO ) target_compile_options(${target} - PRIVATE $<$:> "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." # add C++ compile flags here + PUBLIC + $<$:> + "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." + "-fsanitize=undefined" + "-fno-sanitize-recover=all" + # add C++ compile flags here + ) + target_link_options(${target} + PUBLIC + $<$:> + "-fsanitize=undefined" + "-fno-sanitize-recover=all" ) endfunction() diff --git a/flake.nix b/flake.nix index 91651bd0c1..e4644ef727 100644 --- a/flake.nix +++ b/flake.nix @@ -38,9 +38,15 @@ }; lib = pkgs.lib; - mkShell = pkgs.mkShell.override { + mkShell = attrs: pkgs.mkShell.override { stdenv = pkgs.cudaPackages.backendStdenv; - }; + } (attrs // { + hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch + # signed overflows due to the signedoverflow hardening setting. 
+ # for more details, see the following (long-running) nixpkgs github issues: + # - https://github.com/NixOS/nixpkgs/issues/18995 + # - https://github.com/NixOS/nixpkgs/issues/60919 + }); proj = proj-repo.packages.${system}.proj; in @@ -121,6 +127,8 @@ gpu-ci = mkShell { inputsFrom = [ ci ]; + hardeningDisable = [ "all" ]; + buildInputs = builtins.concatLists [ (with nixGL.packages.${system}; [ nixGLDefault @@ -135,6 +143,8 @@ "${proj-repo.packages.${system}.proj-nvim}" ]; + hardeningDisable = [ "all" ]; + buildInputs = builtins.concatLists [ (with pkgs; [ clang-tools diff --git a/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h index fdc80a1e37..8a7c467303 100644 --- a/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h +++ b/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h @@ -36,8 +36,9 @@ bool is_right_associative(ComputationGraphBinarySPDecomposition const &); std::unordered_multiset get_layers(ComputationGraphBinarySPDecomposition const &); -V1BinarySPDecomposition to_v1(ComputationGraphBinarySPDecomposition const &, - bidict const &layer_numbering); +V1BinarySPDecomposition + to_v1(ComputationGraphBinarySPDecomposition const &, + bidict const &layer_numbering); } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index db7477b460..6f86d1d82a 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -11,12 +11,15 @@ #include "utils/containers/map_from_keys_and_values.h" #include "utils/containers/product.h" #include "utils/containers/range.h" -#include "utils/containers/replicate.h" +#include "utils/containers/repeat_element.h" #include "utils/containers/sorted.h" #include "utils/containers/transform.h" #include "utils/containers/unordered_multiset_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/zip.h" +#include "utils/nonnegative_int/ceildiv.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" namespace FlexFlow { @@ -47,24 +50,29 @@ static std::unordered_set OperatorTaskSpace const &task, DeviceType const &device_type) { - auto get_max_stride_upper_bound = [](std::vector const &tensor_dims, - int total_devices) -> int { - int min_num_devices_with_full_stride_volume = product(transform( - tensor_dims, [](int const &num_devices) { return num_devices - 1; })); - return std::ceil(total_devices / min_num_devices_with_full_stride_volume); + auto get_max_stride_upper_bound = + [](std::vector const &tensor_dims, + nonnegative_int total_devices) -> nonnegative_int { + nonnegative_int min_num_devices_with_full_stride_volume = + product(transform(tensor_dims, [](nonnegative_int num_devices) { + return nonnegative_int{num_devices.unwrap_nonnegative() - 1}; + })); + return ceildiv(total_devices, min_num_devices_with_full_stride_volume); }; - auto candidate_strides = [&](std::vector const &tensor_dims, - int total_devices) + auto candidate_strides = [&](std::vector const &tensor_dims, + nonnegative_int total_devices) -> std::unordered_multiset { - int max_stride_upper_bound = + nonnegative_int max_stride_upper_bound = 
get_max_stride_upper_bound(tensor_dims, total_devices); std::vector single_stride_range = - transform(range(1, max_stride_upper_bound + 1), - [](int stride) { return stride_t{stride}; }); + transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n), + [](nonnegative_int stride) { return stride_t{stride}; }); std::unordered_multiset> raw_stride_vectors = - cartesian_product(replicate(tensor_dims.size(), single_stride_range)); + cartesian_product( + repeat_element(/*num_times=*/num_elements(tensor_dims), + /*element=*/single_stride_range)); std::unordered_multiset strides = transform(raw_stride_vectors, [](auto const &stride_vec) { return MultiDimensionalStride{stride_vec}; @@ -75,8 +83,9 @@ static std::unordered_set auto candidate_starts = [](MachineSpecification const &ms, DeviceType const &device_type) { std::unordered_set result; - for (int node_idx : range(ms.num_nodes)) { - for (int device_idx : range(get_num_devices_per_node(ms, device_type))) { + for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes)) { + for (nonnegative_int device_idx : + nonnegative_range(get_num_devices_per_node(ms, device_type))) { result.insert( MachineSpaceCoordinate{node_idx, device_idx, device_type}); } @@ -91,8 +100,8 @@ static std::unordered_set return get_all_permutations_with_repetition(options, num_dims(task)); }; - std::vector tensor_dims = task.degrees; - int total_devices = get_num_devices(machine_spec, device_type); + std::vector tensor_dims = task.degrees; + nonnegative_int total_devices = get_num_devices(machine_spec, device_type); std::unordered_set machine_views; diff --git a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc index 5126d9687e..bb9d54f1e9 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -11,8 +11,9 @@ std::unordered_set> for (int i = 1; i < resource.num_nodes; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_nodes = i; - sub_resource2.num_nodes = resource.num_nodes - i; + sub_resource1.num_nodes = nonnegative_int{i}; + sub_resource2.num_nodes = + nonnegative_int{resource.num_nodes.unwrap_nonnegative() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } @@ -20,8 +21,9 @@ std::unordered_set> for (int i = 1; i < resource.num_gpus_per_node; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_gpus_per_node = i; - sub_resource2.num_gpus_per_node = resource.num_gpus_per_node - i; + sub_resource1.num_gpus_per_node = nonnegative_int{i}; + sub_resource2.num_gpus_per_node = + nonnegative_int{resource.num_gpus_per_node.unwrap_nonnegative() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index fc3a58995c..82c8274808 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -1,20 +1,14 @@ #include "compiler/machine_mapping/machine_mapping.h" -#include "pcg/machine_specification.h" -#include "pcg/machine_view.h" -#include 
"pcg/operator_task_space.dtg.h" -#include "pcg/operator_task_space.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "utils/containers/are_disjoint.h" -#include "utils/containers/get_one_of.h" #include "utils/containers/keys.h" -#include "utils/containers/map_values.h" #include "utils/containers/merge_maps.h" namespace FlexFlow { MachineMapping combine_disjoint_mappings(MachineMapping const &m1, MachineMapping const &m2) { - return MachineMapping{merge_maps(m1.machine_views, m2.machine_views)}; + return MachineMapping{ + merge_disjoint_maps(m1.machine_views, m2.machine_views)}; } bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) { diff --git a/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc index 715a4c2e3d..ed60004bf4 100644 --- a/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc @@ -10,8 +10,8 @@ ParallelLayerGuidObliviousMachineMapping binary_combine_mappings( ParallelLayerGuidObliviousMachineMapping const &lhs, ParallelLayerGuidObliviousMachineMapping const &rhs) { return ParallelLayerGuidObliviousMachineMapping{ - merge_maps(map_keys(lhs.raw_mapping, nest_inside_left_child), - map_keys(rhs.raw_mapping, nest_inside_right_child)), + merge_disjoint_maps(map_keys(lhs.raw_mapping, nest_inside_left_child), + map_keys(rhs.raw_mapping, nest_inside_right_child)), }; } diff --git a/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc b/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc index 32fb53b58a..9886468386 100644 --- a/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc +++ b/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc @@ -164,7 +164,7 @@ std::unordered_multiset V1BinarySPDecomposition to_v1(ComputationGraphBinarySPDecomposition const &tree, - bidict const &layer_numbering) { + bidict const &layer_numbering) { return tree.visit( overload{[&](ComputationGraphBinarySeriesSplit const &series) { return V1BinarySPDecomposition{ diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc index 936894ad2d..817cc80700 100644 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ b/lib/compiler/test/src/allowed_machine_views.cc @@ -15,39 +15,39 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("1 degree of parallelism") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; std::unordered_set correct = { MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + 
/*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, }, }; @@ -61,18 +61,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("2 degrees of parallelism") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/3, - /*num_cpus_per_node=*/3, - /*num_gpus_per_node=*/3, + /*num_nodes=*/3_n, + /*num_cpus_per_node=*/3_n, + /*num_gpus_per_node=*/3_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{2, 3}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; - auto make_2d_view = [&](int start_node_idx, - int start_device_idx, - int stride1, - int stride2, + auto make_2d_view = [&](nonnegative_int start_node_idx, + nonnegative_int start_device_idx, + nonnegative_int stride1, + nonnegative_int stride2, MachineSpecificationDimension m1, MachineSpecificationDimension m2) { return MachineView{ @@ -86,13 +86,19 @@ TEST_SUITE(FF_TEST_SUITE) { auto intra = MachineSpecificationDimension::INTRA_NODE; auto inter = MachineSpecificationDimension::INTER_NODE; std::unordered_set correct = { - make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, inter, intra), - make_2d_view(1, 0, /*stride1=*/1, /*stride2=*/1, inter, intra), - make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, inter, intra), - - make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, intra, inter), - make_2d_view(0, 1, /*stride1=*/1, /*stride2=*/1, intra, inter), - make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, intra, inter), + make_2d_view( + 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + make_2d_view( + 1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + make_2d_view( + 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra), + + make_2d_view( + 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + make_2d_view( + 0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + make_2d_view( + 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter), }; std::unordered_set result = diff --git a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc index 5c8ea1c0f1..b0d86124a1 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc @@ -28,12 +28,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT,
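[Note: the `_n` suffixes appearing throughout these hunks come from the user-defined literal for nonnegative_int introduced by this series (lib/utils/include/utils/nonnegative_int/nonnegative_int.h). The following is a minimal illustrative sketch of the invariant the literal enforces, not the actual implementation, which has a much richer interface:

#include <stdexcept>

namespace sketch {

// Simplified stand-in for FlexFlow::nonnegative_int: construction checks the
// invariant once, so every later use can assume value >= 0.
class nonnegative_int {
public:
  explicit nonnegative_int(int value) : value_(value) {
    if (value < 0) {
      throw std::invalid_argument("nonnegative_int requires value >= 0");
    }
  }

  int unwrap_nonnegative() const {
    return this->value_;
  }

private:
  int value_;
};

// The literal that lets 3_n, 8_n, etc. parse as checked nonnegative values.
inline nonnegative_int operator""_n(unsigned long long value) {
  return nonnegative_int{static_cast<int>(value)};
}

} // namespace sketch

int main() {
  using namespace sketch;
  nonnegative_int num_nodes = 3_n;               // ok
  int half = num_nodes.unwrap_nonnegative() - 2; // plain int arithmetic
  nonnegative_int rest = nonnegative_int{half};  // re-checked on rewrap
  // nonnegative_int bad = nonnegative_int{-1};  // would throw
  return rest.unwrap_nonnegative();
}

Under this scheme, an expression like nonnegative_int{resource.num_nodes.unwrap_nonnegative() - i} in get_machine_resource_splits.cc below fails fast if a split ever produced a negative node count.]

diff --git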
a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc index 499b111f8f..5f4ba2bfdc 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -8,10 +8,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_machine_resource_splits") { - auto make_machine_spec = [](int num_nodes, int num_gpus_per_node) { + auto make_machine_spec = [](nonnegative_int num_nodes, + nonnegative_int num_gpus_per_node) { return MachineSpecification{ /*num_nodes=*/num_nodes, - /*num_cpus_per_node=*/1, + /*num_cpus_per_node=*/1_n, /*num_gpus_per_node=*/num_gpus_per_node, /*inter_node_bandwidth=*/1.0, /*intra_node_bandwidth=*/1.0, @@ -19,8 +20,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("returns no splits if no splits are possible") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1); + MachineSpecification input = make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n); std::unordered_set> result = get_machine_resource_splits(input); @@ -32,8 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE( "returns splits in gpu and node dimensions, but not at the same time") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/2); + MachineSpecification input = make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/2_n); std::unordered_set> result = get_machine_resource_splits(input); @@ -41,16 +42,16 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set> correct = { { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, }; @@ -60,8 +61,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in node dimension in powers of two") { SUBCASE("num_nodes is a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/8, - /*num_gpus_per_node=*/1); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/8_n, + /*num_gpus_per_node=*/1_n); std::unordered_set< std::pair> @@ -71,34 +73,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/7, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/7_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/6, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/6, - 
/*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/7, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/7_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -106,8 +108,9 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_nodes is not a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/6, - /*num_gpus_per_node=*/1); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n); std::unordered_set< std::pair> @@ -117,28 +120,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/5, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/5_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/5, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/5_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -148,8 +151,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in gpu dimension in powers of two") { SUBCASE("num_gpus_per_node is a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/8); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/8_n); std::unordered_set< std::pair> @@ -159,34 +163,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/7), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/7_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, { - 
make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/7), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/7_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -194,8 +198,9 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_gpus_per_node is not a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n); std::unordered_set< std::pair> @@ -205,28 +210,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/5), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/5_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/5), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/5_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; } diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 542edd9fa9..c5b891781d 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -45,14 +45,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -60,31 +60,31 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -121,8 +121,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + 
SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index 52ad82595d..642fdf7ae1 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -30,12 +30,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -66,14 +66,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView pre_mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -81,14 +81,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView pre_mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -96,14 +96,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView post_mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{3}, + stride_t{3_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -111,14 +111,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView post_mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc index 304034f9be..e88b714bd4 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc @@ -9,14 +9,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("combine_disjoint_mappings(MachineMapping, MachineMappping)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -24,14 +24,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -55,14 +55,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("nodes_are_disjoint(MachineMapping, MachineMappping)") { MachineView machine_view_0 = 
MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -70,14 +70,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 06ab1e5b8c..a8ec24de63 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -65,11 +65,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 1}, + ShardParallelDim{10_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index 73b921fc98..4a261bcdae 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -8,14 +8,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("series_combine") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -23,14 +23,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -189,14 +189,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("parallel_combine") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -204,14 +204,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -312,14 +312,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("minimize_runtime") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + 
/*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -327,14 +327,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 8612017705..313f24c384 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -45,14 +45,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -60,31 +60,31 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -121,8 +121,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 1f3b7545a8..04149cae8f 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -9,14 +9,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("remove_non_pareto_optimal_machine_mapping_result") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -24,14 +24,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ 
/*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -39,14 +39,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -55,19 +55,19 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics cost1 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; OpCostMetrics cost2 = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; OpCostMetrics cost3 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{3}, + /*memory=*/3_n, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ @@ -159,14 +159,14 @@ TEST_SUITE(FF_TEST_SUITE) { "std::optional const&)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -174,14 +174,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -190,7 +190,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics pre_cost = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -217,7 +217,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics post_cost = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{ @@ -360,14 +360,14 @@ TEST_SUITE(FF_TEST_SUITE) { "std::optional const&)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -375,14 +375,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -391,7 +391,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics lhs_cost = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -418,7 +418,7 @@ 
TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics rhs_cost = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -492,14 +492,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("minimize_runtime(memory)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -507,14 +507,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -522,14 +522,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -538,17 +538,17 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics cost1 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; OpCostMetrics cost2 = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; OpCostMetrics cost3 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{3}, + /*memory=*/3_n, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ diff --git a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc index 2b59669aad..d0f289043c 100644 --- a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc @@ -29,11 +29,12 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; b.create_input(input_shape, CreateGrad::YES, input_layer_name); return b.computation_graph; @@ -57,16 +58,17 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES, input_layer_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -119,9 +121,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 
10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -129,7 +131,7 @@ TEST_SUITE(FF_TEST_SUITE) { b.create_input(input_shape, CreateGrad::YES, input_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -138,7 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*name=*/op1_name, /*projection_name=*/w1_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -189,9 +191,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -246,9 +248,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -277,7 +279,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("real models") { SUBCASE("split_test") { ComputationGraph cg = - get_split_test_computation_graph(/*batch_size=*/8); + get_split_test_computation_graph(/*batch_size=*/8_n); std::optional sp_decomposition = get_computation_graph_series_parallel_decomposition(cg); @@ -339,14 +341,15 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); - b.dense(input, /*outDim=*/14); + b.dense(input, /*outDim=*/14_n); return b.computation_graph; }(); @@ -356,7 +359,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("split_test") { - ComputationGraph cg = get_split_test_computation_graph(/*batch_size=*/8); + ComputationGraph cg = + get_split_test_computation_graph(/*batch_size=*/8_n); std::string result = render_preprocessed_computation_graph_for_sp_decomposition(cg); diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc index e278338440..d262539dc1 100644 --- a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -38,9 +38,9 @@ namespace FlexFlow { TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("task_simulator_estimate_forward_pass_time") { MachineSpecification machine_spec = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/3, - /*num_gpus_per_node=*/3, + MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/3_n, + /*num_gpus_per_node=*/3_n, /*inter_node_bandwidth=*/1.0f, /*intra_node_bandwidth=*/1.0f}; @@ -50,8 +50,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -63,16 +63,16 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t layer1 = get_source_layer(tensor1); std::vector dims = { - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, }; ParallelComputationGraph pcg = b.pcg; MachineView mv1 = - MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + 
MachineView{MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; MachineView mv2 = - MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; + MachineView{MachineSpaceCoordinate{0_n, 1_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv1}, @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); @@ -99,16 +99,16 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/1.0f, /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layer1 + /*memory=*/0_n}; // layer1 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); @@ -124,10 +124,10 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ - FFOrdered{ShardParallelDim{10, 1}}, + FFOrdered{ShardParallelDim{10_n, 1_n}}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -145,23 +145,23 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = b.pcg; std::vector dims = { - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, }; SUBCASE("all different devices") { - MachineView mv0 = - MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; - MachineView mv1 = - MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; - MachineView mv2 = - MachineView{MachineSpaceCoordinate{1, 0, DeviceType::GPU}, dims}; - MachineView mv3 = - MachineView{MachineSpaceCoordinate{1, 1, DeviceType::GPU}, dims}; + MachineView mv0 = MachineView{ + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; + MachineView mv1 = MachineView{ + MachineSpaceCoordinate{0_n, 1_n, DeviceType::GPU}, dims}; + MachineView mv2 = MachineView{ + MachineSpaceCoordinate{1_n, 0_n, DeviceType::GPU}, dims}; + MachineView mv3 = MachineView{ + MachineSpaceCoordinate{1_n, 1_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv0}, @@ -174,7 +174,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); @@ -187,30 +187,29 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { - return OpCostMetrics{ - /*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layers 1, 2 + return OpCostMetrics{/*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/0_n}; 
// layers 1, 2 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/2.0f, /*backward_runtime=*/2.0f, - /*memory=*/nonnegative_int{0}}; // layer3 + /*memory=*/0_n}; // layer3 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); } } SUBCASE("all the same device") { - MachineView mv = - MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineView mv = MachineView{ + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv}, {layer1, mv}, @@ -222,7 +221,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); @@ -235,22 +234,21 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { - return OpCostMetrics{ - /*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layers 1, 2 + return OpCostMetrics{/*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/0_n}; // layers 1, 2 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/2.0f, /*backward_runtime=*/2.0f, - /*memory=*/nonnegative_int{0}}; // layer3 + /*memory=*/0_n}; // layer3 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); float result = task_simulator_estimate_forward_pass_time( diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc index 46177ad420..0fd9e245a6 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/graph_optimize_state.cc @@ -11,35 +11,37 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ParallelTensorDims{ FFOrdered{ - ShardParallelDim{32, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{32_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT}; parallel_tensor_guid_t input0 = builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); - parallel_tensor_guid_t dense0 = builder.dense(input0, - 8, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense0"); + parallel_tensor_guid_t dense0 = + builder.dense(/*input=*/input0, + /*outDim=*/8_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense0"); - parallel_tensor_guid_t dense1 = builder.dense(dense0, - 4, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense1"); + parallel_tensor_guid_t dense1 = + builder.dense(/*input=*/dense0, + /*outDim=*/4_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense1"); ParallelComputationGraph pcg = builder.pcg; @@ -59,14 +61,15 @@ TEST_SUITE(FF_TEST_SUITE) 
{ parallel_tensor_guid_t input0_ = builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); - parallel_tensor_guid_t dense0_ = builder.dense(input0, - 8, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense0"); + parallel_tensor_guid_t dense0_ = + builder.dense(/*input=*/input0, + /*outDim=*/8_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense0"); ParallelComputationGraph pcg_ = builder.pcg; diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 326c6922f9..57498ee466 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,8 +1,9 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define _FLEXFLOW_KERNELS_ARRAY_SHAPE_H -#include "legion_dim.h" +#include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include "utils/stack_vector/stack_vector.h" #include "utils/visitable.h" #include @@ -14,44 +15,49 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(size_t *dims, size_t num_dims); + ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + ArrayShape(std::vector const &); /** * @brief Alias of ArrayShape::num_elements for compatibility with * Legion::Domain */ - std::size_t get_volume() const; + nonnegative_int get_volume() const; /** * @brief Alias of ArrayShape::num_dims for compatibility with Legion::Domain */ - std::size_t get_dim() const; + nonnegative_int get_dim() const; - std::size_t num_elements() const; - std::size_t num_dims() const; + nonnegative_int num_elements() const; + nonnegative_int num_dims() const; - std::size_t operator[](legion_dim_t) const; - std::size_t at(legion_dim_t) const; - std::size_t at(ff_dim_t) const; + nonnegative_int operator[](legion_dim_t) const; + nonnegative_int at(legion_dim_t) const; + nonnegative_int at(ff_dim_t) const; + + bool operator==(ArrayShape const &) const; + bool operator!=(ArrayShape const &) const; legion_dim_t last_idx() const; legion_dim_t neg_idx(int) const; - std::optional at_maybe(legion_dim_t) const; - std::optional at_maybe(ff_dim_t) const; + std::optional at_maybe(legion_dim_t) const; + std::optional at_maybe(ff_dim_t) const; ArrayShape sub_shape(std::optional> start, std::optional> end) const; public: - LegionTensorDims dims; + LegionOrdered dims; + +private: + std::tuple tie() const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ArrayShape, dims); -size_t get_volume(ArrayShape const &); +nonnegative_int get_volume(ArrayShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 3fea92c86b..4b89eb1411 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -3,46 +3,11 @@ #include "device.h" #include "kernels/allocation.h" +#include "kernels/batch_norm_per_device_state.dtg.h" #include "kernels/ff_handle.h" #include namespace FlexFlow { - -struct BatchNormPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t outputTensor; - ffTensorDescriptor_t biasTensor; - ffActivationDescriptor_t actiDesc; - ffBatchNormMode_t mode; - float 
*runningMean;
-  float *runningVar;
-  float *saveMean;
-  float *saveVar;
-  int output_n;
-  int output_c;
-  int output_h;
-  int output_w;
-  req<bool> relu;
-};
-
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState,
-                                             handle,
-                                             inputTensor,
-                                             outputTensor,
-                                             biasTensor,
-                                             actiDesc,
-                                             mode,
-                                             runningMean,
-                                             runningVar,
-                                             saveMean,
-                                             saveVar,
-                                             output_n,
-                                             output_c,
-                                             output_h,
-                                             output_w,
-                                             relu);
-
 namespace Kernels::BatchNorm {
 
 BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle,
@@ -55,14 +20,14 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                     bool relu);
 
 void forward_kernel(ffStream_t stream,
-                    BatchNormPerDeviceState const &m,
+                    BatchNormPerDeviceState const &per_device_state,
                     float const *input_ptr,
                     float *output_ptr,
                     float const *scale_ptr,
                     float const *bias_ptr);
 
 void backward_kernel(ffStream_t stream,
-                     BatchNormPerDeviceState const &m,
+                     BatchNormPerDeviceState const &per_device_state,
                      float const *output_ptr,
                      float *output_grad_ptr,
                      float const *input_ptr,
diff --git a/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml b/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml
new file mode 100644
index 0000000000..6d2f04f60c
--- /dev/null
+++ b/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml
@@ -0,0 +1,68 @@
+namespace = "FlexFlow"
+name = "BatchNormPerDeviceState"
+features = []
+
+includes = [
+  "kernels/device.h",
+  "kernels/ff_handle.h",
+]
+
+[[fields]]
+name = "handle"
+type = "::FlexFlow::PerDeviceFFHandle"
+
+[[fields]]
+name = "inputTensor"
+type = "ffTensorDescriptor_t"
+
+[[fields]]
+name = "outputTensor"
+type = "ffTensorDescriptor_t"
+
+[[fields]]
+name = "biasTensor"
+type = "ffTensorDescriptor_t"
+
+[[fields]]
+name = "actiDesc"
+type = "ffActivationDescriptor_t"
+
+[[fields]]
+name = "mode"
+type = "ffBatchNormMode_t"
+
+[[fields]]
+name = "runningMean"
+type = "float *"
+
+[[fields]]
+name = "runningVar"
+type = "float *"
+
+[[fields]]
+name = "saveMean"
+type = "float *"
+
+[[fields]]
+name = "saveVar"
+type = "float *"
+
+[[fields]]
+name = "output_n"
+type = "int"
+
+[[fields]]
+name = "output_c"
+type = "int"
+
+[[fields]]
+name = "output_h"
+type = "int"
+
+[[fields]]
+name = "output_w"
+type = "int"
+
+[[fields]]
+name = "relu"
+type = "bool"
diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h
index e4dd9723b8..7b9b9c455c 100644
--- a/lib/kernels/include/kernels/legion_dim.h
+++ b/lib/kernels/include/kernels/legion_dim.h
@@ -8,19 +8,23 @@ namespace FlexFlow {
 
 legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);
 
-legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions);
+legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions);
 
 template <typename T>
 using LegionOrdered = DimOrdered<legion_dim_t, T>;
 
-using LegionTensorDims = LegionOrdered<size_t>;
-
 template <typename T>
 FFOrdered<T>
     ff_ordered_from_legion_ordered(LegionOrdered<T> const &legion_ordered) {
   return FFOrdered<T>(legion_ordered.rbegin(), legion_ordered.rend());
 }
 
+template <typename T>
+LegionOrdered<T>
+    legion_ordered_from_ff_ordered(FFOrdered<T> const &ff_ordered) {
+  return LegionOrdered<T>(ff_ordered.rbegin(), ff_ordered.rend());
+}
+
 template <typename T>
 std::string format_as(LegionOrdered<T> const &v) {
   std::vector<T> as_vec(v.cbegin(), v.cend());
diff --git a/lib/kernels/include/kernels/legion_dim_t.struct.toml b/lib/kernels/include/kernels/legion_dim_t.struct.toml
index d2afb0d73f..6c047f096b 100644
--- a/lib/kernels/include/kernels/legion_dim_t.struct.toml
+++
b/lib/kernels/include/kernels/legion_dim_t.struct.toml @@ -1,6 +1,5 @@ namespace = "FlexFlow" name = "legion_dim_t" - features = [ "eq", "ord", @@ -9,6 +8,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "value" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml index f99ff10bb9..0171e3e497 100644 --- a/lib/kernels/include/kernels/per_device_op_state.variant.toml +++ b/lib/kernels/include/kernels/per_device_op_state.variant.toml @@ -19,7 +19,6 @@ includes = [ "kernels/reshape_kernels.h", "kernels/softmax_kernels.h", "kernels/topk_kernels.h", - "kernels/transpose_kernels.h", ] [[values]] @@ -81,7 +80,3 @@ key = "softmax_per_device_state" [[values]] type = "::FlexFlow::TopKPerDeviceState" key = "topk_per_device_state" - -[[values]] -type = "::FlexFlow::TransposePerDeviceState" -key = "transpose_per_device_state" diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index dbf78826cb..0ed10ac03d 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -3,31 +3,20 @@ #include "device.h" #include "kernels/accessor.h" +#include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -struct TransposePerDeviceState { - int num_dim; - req> perm; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TransposePerDeviceState, - num_dim, - perm); - namespace Kernels::Transpose { -TransposePerDeviceState init_kernel(int num_dim, - std::vector const &perm); - void forward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &attrs, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &attrs, GenericTensorAccessorR const &out_grad, GenericTensorAccessorW const &in_grad); diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 733146851a..bed8daba51 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -17,7 +17,8 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); + void *ptr = + this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); return {tensor_shape.data_type, tensor_shape, ptr, diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 5c18a9ab5a..54534f2ccf 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -1,55 +1,64 @@ #include "kernels/array_shape.h" #include "utils/containers/product.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionTensorDims - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - std::vector sizes(ff_ordered.size()); - std::reverse_copy(ff_ordered.begin(), ff_ordered.end(), sizes.begin()); - return LegionTensorDims(sizes.begin(), sizes.end()); +static LegionOrdered + legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { + return LegionOrdered{reversed(vector_of(ff_ordered))}; } -ArrayShape::ArrayShape(size_t *_dims, size_t num_dims) - : dims(_dims, _dims + num_dims) {} 
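// Annotation (illustrative sketch, not part of the patch): the dimension
// ordering this file converts between, assuming the helpers declared in
// kernels/legion_dim.h as modified by this patch. Legion orders dimensions
// innermost-first and FlexFlow outermost-first, so each conversion is a
// reversal, and the round trip is the identity.
static void legion_ff_ordering_example() {
  FFOrdered<nonnegative_int> ff_dims = {32_n, 16_n, 8_n};
  // Reversal: legion_dims holds {8_n, 16_n, 32_n}.
  LegionOrdered<nonnegative_int> legion_dims =
      legion_ordered_from_ff_ordered(ff_dims);
  assert(ff_ordered_from_legion_ordered(legion_dims) == ff_dims);
}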
+ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims)
+    : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {}
 
 ArrayShape::ArrayShape(TensorShape const &shape)
     : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {}
 
-ArrayShape::ArrayShape(std::vector<std::size_t> const &input_dims)
+ArrayShape::ArrayShape(std::vector<nonnegative_int> const &input_dims)
     : dims(input_dims) {}
 
-std::size_t ArrayShape::get_volume() const {
+nonnegative_int ArrayShape::get_volume() const {
   return this->num_elements();
 }
 
-std::size_t ArrayShape::num_dims() const {
-  return this->dims.size();
+nonnegative_int ArrayShape::num_dims() const {
+  return ::FlexFlow::num_elements(this->dims);
 }
 
-std::size_t ArrayShape::get_dim() const {
+nonnegative_int ArrayShape::get_dim() const {
   return this->num_dims();
 }
 
-std::size_t ArrayShape::num_elements() const {
+nonnegative_int ArrayShape::num_elements() const {
   if (dims.size() == 0) {
-    return 0;
+    return 0_n;
   }
   return product(this->dims);
 }
 
-std::size_t ArrayShape::operator[](legion_dim_t idx) const {
+nonnegative_int ArrayShape::operator[](legion_dim_t idx) const {
   return dims.at(idx);
 }
 
-std::size_t ArrayShape::at(legion_dim_t idx) const {
+nonnegative_int ArrayShape::at(legion_dim_t idx) const {
   return dims.at(idx);
 }
 
-std::size_t ArrayShape::at(ff_dim_t idx) const {
+nonnegative_int ArrayShape::at(ff_dim_t idx) const {
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
+bool ArrayShape::operator==(ArrayShape const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool ArrayShape::operator!=(ArrayShape const &other) const {
+  return this->tie() != other.tie();
+}
+
 ArrayShape ArrayShape::sub_shape(
     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
@@ -57,7 +66,7 @@ ArrayShape ArrayShape::sub_shape(
   NOT_IMPLEMENTED();
 }
 
-std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
+std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
   if (index.value < dims.size()) {
     return dims.at(index);
   } else {
@@ -65,11 +74,15 @@ std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
   }
 }
 
-std::optional<std::size_t> ArrayShape::at_maybe(ff_dim_t index) const {
+std::optional<nonnegative_int> ArrayShape::at_maybe(ff_dim_t index) const {
   return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims()));
 }
 
-size_t get_volume(ArrayShape const &shape) {
+std::tuple<LegionOrdered<nonnegative_int> const &> ArrayShape::tie() const {
+  return std::tie(this->dims);
+}
+
+nonnegative_int get_volume(ArrayShape const &shape) {
   return shape.get_volume();
 }
 
diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu
index b30cf6a663..4669955019 100644
--- a/lib/kernels/src/cuda/cuda_helper.cu
+++ b/lib/kernels/src/cuda/cuda_helper.cu
@@ -224,10 +224,10 @@ ffStatus_t
       tensor,
       CUDNN_TENSOR_NCHW,
       CUDNN_DATA_FLOAT,
-      shape.at_maybe(legion_dim_t{0}).value_or(1),
-      shape.at_maybe(legion_dim_t{1}).value_or(1),
-      shape.at_maybe(legion_dim_t{2}).value_or(1),
-      shape.at_maybe(legion_dim_t{3}).value_or(1));
+      shape.at_maybe(legion_dim_t{0_n}).value_or(1_n).unwrap_nonnegative(),
+      shape.at_maybe(legion_dim_t{1_n}).value_or(1_n).unwrap_nonnegative(),
+      shape.at_maybe(legion_dim_t{2_n}).value_or(1_n).unwrap_nonnegative(),
+      shape.at_maybe(legion_dim_t{3_n}).value_or(1_n).unwrap_nonnegative());
 }
 
 cudnnDataType_t ff_to_cudnn_datatype(DataType type) {
diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu
index 512981e32b..98c528cd7b 100644
--- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu
+++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu
@@ -145,21
+145,23 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } - BatchNormPerDeviceState per_device_state = {handle, - inputTensor, - outputTensor, - biasTensor, - actiDesc, - mode, - runningMean, - runningVar, - saveMean, - saveVar, - output_n, - output_c, - output_h, - output_w, - relu}; + BatchNormPerDeviceState per_device_state = BatchNormPerDeviceState{ + handle, + inputTensor, + outputTensor, + biasTensor, + actiDesc, + mode, + runningMean, + runningVar, + saveMean, + saveVar, + output_n, + output_c, + output_h, + output_w, + relu, + }; checkCUDA(cudaStreamDestroy(stream)); return per_device_state; diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index afc3e1f7ef..230ca70627 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -41,7 +41,7 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + size_t volume = input.shape.get_volume().unwrap_nonnegative(); cast_forward<<>>( input.get(), output.get(), volume); } @@ -52,7 +52,7 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.get_volume(); + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( output.get(), input.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 98c01d1f7c..7cc67ceed8 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -29,7 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get
<DT>(),
                               input.get<DT>(),
-                              input.shape.get_volume() * size_of_datatype(DT),
+                              input.shape.get_volume().unwrap_nonnegative() *
+                                  size_of_datatype(DT).unwrap_nonnegative(),
                               cudaMemcpyDeviceToDevice,
                               stream));
   }
 
@@ -40,7 +41,7 @@ struct BackwardKernel {
   void operator()(ffStream_t stream,
                   GenericTensorAccessorR const &output_grad,
                   GenericTensorAccessorW const &input_grad) {
-    size_t num_elements = output_grad.shape.get_volume();
+    size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative();
     add_kernel<real_type_t<DT>>
         <<<GET_BLOCKS(num_elements), CUDA_NUM_THREADS, 0, stream>>>(
             input_grad.get<DT>(), output_grad.get<DT>
(), num_elements); diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index ad216feda2..aa442f5c3d 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -23,8 +23,11 @@ void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0}, axis).num_elements(); - num_blocks = shape.sub_shape(axis, std::nullopt).num_elements(); + blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + .num_elements() + .unwrap_nonnegative(); + num_blocks = + shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index 0a4024ba8a..32e749e15a 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -137,15 +137,15 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; - int input_w = input.shape[legion_dim_t(0)]; - int input_h = input.shape[legion_dim_t(1)]; - int input_c = input.shape[legion_dim_t(2)]; - int input_n = input.shape[legion_dim_t(3)]; + int input_w = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); + int input_h = input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); + int input_c = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); + int input_n = input.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); - int output_w = output.shape[legion_dim_t(0)]; - int output_h = output.shape[legion_dim_t(1)]; - int output_c = output.shape[legion_dim_t(2)]; - int output_n = output.shape[legion_dim_t(3)]; + int output_w = output.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); + int output_h = output.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); + int output_c = output.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); + int output_n = output.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 687a9fa220..e096803682 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -266,7 +266,7 @@ struct ForwardKernel { output.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_scalar_unary_forward_kernel> <<>>( num_elements, @@ -275,7 +275,7 @@ struct ForwardKernel { input.get(), output.get()); } else { - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_unary_forward_kernel> <<>>( num_elements, op_type, input.get(), output.get()); @@ -312,7 +312,7 @@ struct BackwardKernel { input_grad.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_scalar_unary_backward_kernel> <<>>( num_elements, @@ -323,7 +323,7 @@ struct BackwardKernel { input.get(), input_grad.get()); } else { - size_t num_elements = input.shape.num_elements(); + size_t num_elements = 
input.shape.num_elements().unwrap_nonnegative(); elewise_unary_backward_kernel> <<>>( num_elements, diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index f661e5fb0a..14bb3bddd1 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -27,7 +27,8 @@ void forward_kernel(cudaStream_t stream, checkCUDA(cudaMemcpyAsync(output_ptr, input.get_float_ptr(), - (input.shape.num_elements()) * sizeof(float), + input.shape.num_elements().unwrap_nonnegative() * + sizeof(float), cudaMemcpyDeviceToDevice, stream)); } @@ -39,8 +40,13 @@ void backward_kernel(cudaStream_t stream, float alpha = 1.0f; apply_add_with_scale - <<>>( - input_grad_ptr, output_grad_ptr, input.shape.num_elements(), alpha); + <<>>(input_grad_ptr, + output_grad_ptr, + input.shape.num_elements().unwrap_nonnegative(), + alpha); } } // namespace Flat diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 11c0a1a5e7..31c1bac217 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -128,22 +128,24 @@ void forward_kernel(ffStream_t stream, coord_t stride = output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .num_elements(); - coord_t output_dim_size = output.shape[m.legion_dim]; - coord_t input_dim_size = input.shape[m.legion_dim]; + .num_elements() + .unwrap_nonnegative(); + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); + coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); - DataTypeDispatch1{}(index.data_type, - stream, - input, - index, - output, - output.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + DataTypeDispatch1{}( + index.data_type, + stream, + input, + index, + output, + output.shape.get_volume().unwrap_nonnegative(), + stride, + input_dim_size, + output_dim_size); } void backward_kernel(ffStream_t stream, @@ -156,22 +158,26 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume(); - coord_t output_dim_size = output_grad.shape[m.legion_dim]; - coord_t input_dim_size = input_grad.shape[m.legion_dim]; + .get_volume() + .unwrap_nonnegative(); + coord_t output_dim_size = + output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); + coord_t input_dim_size = + input_grad.shape.at(m.legion_dim).unwrap_nonnegative(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); - DataTypeDispatch1{}(index.data_type, - stream, - output_grad, - index, - input_grad, - output_grad.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + DataTypeDispatch1{}( + index.data_type, + stream, + output_grad, + index, + input_grad, + output_grad.shape.get_volume().unwrap_nonnegative(), + stride, + input_dim_size, + output_dim_size); } } // namespace Gather diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 3687c1cedf..67d5c25c3b 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -29,7 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + 
size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -41,12 +42,13 @@ struct BackwardKernel { RepartitionPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - add_kernel><<>>(input_grad.get(), - output_grad.get(), - input_grad.shape.num_elements()); + add_kernel> + <<>>(input_grad.get(), + output_grad.get(), + input_grad.shape.num_elements().unwrap_nonnegative()); } }; diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 9c3e8dcc40..0ef7e304cf 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -41,12 +41,13 @@ struct ForwardKernel { GenericTensorAccessorW const &output, size_t num_replicas) { - size_t total_elements = input.shape.num_elements() * num_replicas; + size_t total_elements = + input.shape.num_elements().unwrap_nonnegative() * num_replicas; reduction_forward_kernel> <<>>( input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), num_replicas); } }; @@ -58,7 +59,8 @@ struct BackwardKernel { GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 1aa61375f0..b4fa5edb89 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -41,7 +41,8 @@ struct ForwardKernel { checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -53,12 +54,13 @@ struct BackwardKernel { GenericTensorAccessorR const &output, GenericTensorAccessorW const &input, size_t num_replicas) { - size_t total_elements = input.shape.num_elements() * num_replicas; + size_t total_elements = + input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), num_replicas); } }; diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index b7a328ca08..374dfb22ba 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -33,7 +33,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -46,12 +47,12 @@ struct BackwardKernel { GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> - <<>>(input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), static_cast>(alpha)); } }; diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 37e1a08326..e1aaacc7f9 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -16,7 +16,9 @@ 
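// Annotation (illustrative sketch, not part of the patch): the memcpy-style
// kernels above (combine, partition, reduction, replicate, reshape) all
// compute the transfer size the same way now that num_elements() and
// size_of_datatype() return nonnegative_int. A minimal sketch of the shared
// pattern, assuming only the accessor and datatype APIs shown in this diff;
// the helper name accessor_size_in_bytes is hypothetical.
static size_t accessor_size_in_bytes(GenericTensorAccessorR const &accessor) {
  // Bytes to copy: element count times element width, each unwrapped from
  // nonnegative_int exactly once, at the point of use.
  return accessor.shape.num_elements().unwrap_nonnegative() *
         size_of_datatype(accessor.data_type).unwrap_nonnegative();
}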
#include "device.h" #include "kernels/accessor.h" #include "kernels/transpose_kernels.h" +#include "op-attrs/dim_ordered/transform.h" #include "utils/exception.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -29,19 +31,6 @@ struct TransposeStrides { namespace Kernels { namespace Transpose { -TransposePerDeviceState init_kernel(int num_dim, - std::vector const &perm) { - int const length = perm.size(); - - std::vector perm_vector; - assert(length <= MAX_TENSOR_DIM); - for (int i = 0; i < length; ++i) { - perm_vector.push_back(legion_dim_from_ff_dim(perm[i], num_dim)); - } - - return {num_dim, perm_vector}; -} - __global__ void transpose_simple_kernel(std::size_t volume, float const *in_ptr, float *out_ptr, @@ -59,64 +48,92 @@ __global__ void transpose_simple_kernel(std::size_t volume, } } +static LegionOrdered + legion_ordered_perm_from_ff_ordered(FFOrdered const &perm) { + nonnegative_int perm_size = num_elements(perm); + LegionOrdered legion_ordered_perm = + transform(legion_ordered_from_ff_ordered(perm), [&](ff_dim_t d) { + return legion_dim_from_ff_dim(d, perm_size); + }); + + return legion_ordered_perm; +} + void forward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { TransposeStrides info; - info.num_dim = input.shape.num_dims(); - assert(info.num_dim == m.num_dim); + info.num_dim = input.shape.num_dims().unwrap_nonnegative(); + assert(info.num_dim == m.perm.size()); + + LegionOrdered legion_ordered_perm = + legion_ordered_perm_from_ff_ordered(m.perm); + for (int i = 0; i < info.num_dim; i++) { if (i == 0) { info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = input.shape[legion_dim_t(i)] + 1; - int out_dim_size = output.shape[legion_dim_t(i)] + 1; + int in_dim_size = + input.shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative(); + int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } - info.perm[i] = m.perm[i].value; + + info.perm[i] = legion_ordered_perm.at(legion_dim_t{nonnegative_int{i}}) + .value.unwrap_nonnegative(); } - transpose_simple_kernel<<>>(output.shape.get_volume(), - input.get_float_ptr(), - output.get_float_ptr(), - info, - 0.0f /*beta*/); + transpose_simple_kernel<<< + GET_BLOCKS(output.shape.get_volume().unwrap_nonnegative()), + CUDA_NUM_THREADS, + 0, + stream>>>(output.shape.get_volume().unwrap_nonnegative(), + input.get_float_ptr(), + output.get_float_ptr(), + info, + 0.0f /*beta*/); } void backward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &m, GenericTensorAccessorR const &out_grad, GenericTensorAccessorW const &in_grad) { TransposeStrides info; - info.num_dim = in_grad.shape.num_dims(); - assert(info.num_dim == m.num_dim); + info.num_dim = in_grad.shape.num_dims().unwrap_nonnegative(); + assert(info.num_dim == m.perm.size()); + + LegionOrdered legion_ordered_perm = + legion_ordered_perm_from_ff_ordered(m.perm); + for (int i = 0; i < info.num_dim; i++) { if (i == 0) { info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = out_grad.shape[legion_dim_t(i)] + 1; - int out_dim_size = in_grad.shape[legion_dim_t(i)] + 1; + int in_dim_size = out_grad.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); + int out_dim_size = 
in_grad.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } - info.perm[m.perm[i].value] = i; + info.perm[legion_ordered_perm.at(legion_dim_t{nonnegative_int{i}}) + .value.unwrap_nonnegative()] = i; } - transpose_simple_kernel<<>>(in_grad.shape.get_volume(), - out_grad.get_float_ptr(), - in_grad.get_float_ptr(), - info, - 1.0f /*beta*/); + transpose_simple_kernel<<< + GET_BLOCKS(in_grad.shape.get_volume().unwrap_nonnegative()), + CUDA_NUM_THREADS, + 0, + stream>>>(in_grad.shape.get_volume().unwrap_nonnegative(), + out_grad.get_float_ptr(), + in_grad.get_float_ptr(), + info, + 1.0f /*beta*/); } } // namespace Transpose diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 142dcbcb2c..bbb15c5636 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -3,11 +3,14 @@ namespace FlexFlow { legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { - return legion_dim_t(legion_dim.value + value); + return legion_dim_t{ + nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}}; } -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { - return legion_dim_t(num_dimensions - ff_dim.value.get_value() - 1); +legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, + nonnegative_int num_dimensions) { + return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - + ff_dim.value.unwrap_nonnegative() - 1}}; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 023233ecb0..bd0167a677 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -6,11 +6,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { - size_t num_samples = 10; - size_t num_heads = 4; - size_t qSize = 64, kSize = 64, vSize = 64; - size_t qProjSize = 64, kProjSize = 64, vProjSize = 64, oProjSize = 64; - size_t qoSeqLength = 20, kvSeqLength = 20; + nonnegative_int num_samples = 10_n; + nonnegative_int num_heads = 4_n; + nonnegative_int qSize = 64_n; + nonnegative_int kSize = 64_n; + nonnegative_int vSize = 64_n; + nonnegative_int qProjSize = 64_n; + nonnegative_int kProjSize = 64_n; + nonnegative_int vProjSize = 64_n; + nonnegative_int oProjSize = 64_n; + nonnegative_int qoSeqLength = 20_n; + nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{ @@ -19,21 +25,21 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - MHAPerDeviceState state = - Kernels::MultiHeadAttention::init_kernel(managed_handle.raw_handle(), - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - false); + MHAPerDeviceState state = Kernels::MultiHeadAttention::init_kernel( + managed_handle.raw_handle(), + allocator, + /*num_samples=*/num_samples.unwrap_nonnegative(), + /*num_heads=*/num_heads.unwrap_nonnegative(), + /*qSize=*/qSize.unwrap_nonnegative(), + /*kSize=*/kSize.unwrap_nonnegative(), + /*vSize=*/vSize.unwrap_nonnegative(), + /*qProjSize=*/qProjSize.unwrap_nonnegative(), + /*kProjSize=*/kProjSize.unwrap_nonnegative(), + /*vProjSize=*/vProjSize.unwrap_nonnegative(), + /*oProjSize=*/oProjSize.unwrap_nonnegative(), + 
/*qoSeqLength=*/qoSeqLength.unwrap_nonnegative(), + /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), + /*add_bias_kv=*/false); TensorShape query_shape = make_tensor_shape_from_legion_dims( {qoSeqLength, num_samples, qSize}, DataType::FLOAT); @@ -43,8 +49,8 @@ TEST_SUITE(FF_TEST_SUITE) { {kvSeqLength, num_samples, vSize}, DataType::FLOAT); TensorShape output_shape = make_tensor_shape_from_legion_dims( {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); - TensorShape weight_shape = - make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); + TensorShape weight_shape = make_tensor_shape_from_legion_dims( + {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 8a11a069f5..d78d5daee5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -6,13 +6,13 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { - size_t m = 10; - size_t n = 10; - size_t k = 10; - size_t batch = 5; - size_t a_seq_length_dim = -1; - size_t b_seq_length_dim = -1; - size_t seq_length = -1; + nonnegative_int m = 10_n; + nonnegative_int n = 10_n; + nonnegative_int k = 10_n; + nonnegative_int batch = 5_n; + int a_seq_length_dim = -1; + int b_seq_length_dim = -1; + int seq_length = -1; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{ @@ -41,10 +41,10 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr(), a_accessor.get_float_ptr(), b_accessor.get_float_ptr(), - m, - n, - k, - batch, + m.unwrap_nonnegative(), + n.unwrap_nonnegative(), + k.unwrap_nonnegative(), + batch.unwrap_nonnegative(), a_seq_length_dim, b_seq_length_dim, seq_length); @@ -66,10 +66,10 @@ TEST_SUITE(FF_TEST_SUITE) { a_grad_accessor.get_float_ptr(), b_accessor.get_float_ptr(), b_grad_accessor.get_float_ptr(), - m, - n, - k, - batch); + m.unwrap_nonnegative(), + n.unwrap_nonnegative(), + k.unwrap_nonnegative(), + batch.unwrap_nonnegative()); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 270fad7bb6..d0ec2559ba 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -7,7 +7,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { - size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; + nonnegative_int output_n = 1_n; + nonnegative_int output_c = 10_n; + nonnegative_int output_h = 10_n; + nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{ @@ -16,15 +19,15 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - BatchNormPerDeviceState state = - Kernels::BatchNorm::init_kernel(managed_handle.raw_handle(), - allocator, - nullptr, - output_n, - output_c, - output_h, - output_w, - true); + BatchNormPerDeviceState state = Kernels::BatchNorm::init_kernel( + /*handle=*/managed_handle.raw_handle(), + /*allocator=*/allocator, + /*runningMean=*/nullptr, + /*output_n=*/output_n.unwrap_nonnegative(), + /*output_c=*/output_c.unwrap_nonnegative(), + /*output_h=*/output_h.unwrap_nonnegative(), + /*output_w=*/output_w.unwrap_nonnegative(), + /*relu=*/true); TensorShape input_shape = make_tensor_shape_from_legion_dims( 
{output_n, output_c, output_h, output_w}, DataType::FLOAT); @@ -46,12 +49,13 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW bias_accessor = create_filled_accessor_w( bias_shape, allocator, make_float_data_type_value(0)); - Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - scale_accessor.get_float_ptr(), - bias_accessor.get_float_ptr()); + Kernels::BatchNorm::forward_kernel( + /*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input_ptr=*/input_accessor.get_float_ptr(), + /*output_ptr=*/output_accessor.get_float_ptr(), + /*scale_ptr=*/scale_accessor.get_float_ptr(), + /*bias_ptr=*/bias_accessor.get_float_ptr()); CHECK(contains_non_zero(output_accessor)); } @@ -66,16 +70,18 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW bias_grad_accessor = create_random_filled_accessor_w(bias_shape, allocator); - Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), - state, - output_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr(), - input_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr(), - scale_accessor.get_float_ptr(), - scale_grad_accessor.get_float_ptr(), - bias_grad_accessor.get_float_ptr(), - input_accessor.shape.num_elements()); + Kernels::BatchNorm::backward_kernel( + /*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + /*input_ptr=*/input_accessor.get_float_ptr(), + /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), + /*scale_ptr=*/scale_accessor.get_float_ptr(), + /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), + /*bias_grad_ptr=*/bias_grad_accessor.get_float_ptr(), + /*numElements=*/ + input_accessor.shape.num_elements().unwrap_nonnegative()); CHECK(contains_non_zero(input_grad_accessor)); CHECK(contains_non_zero(scale_grad_accessor)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 1be5839a9c..2ac27a9747 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -11,9 +11,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::DOUBLE); + make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index a4688a1030..91f42669eb 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 5447b12fc5..3587cecedd 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -6,9 +6,9 @@ using namespace 
::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - size_t num_inputs = 2; - size_t size_per_input = 10; - ff_dim_t concat_axis = ff_dim_t{nonnegative_int{1}}; + nonnegative_int num_inputs = 2_n; + nonnegative_int size_per_input = 10_n; + ff_dim_t concat_axis = ff_dim_t{1_n}; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -24,7 +24,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { std::vector input_accessors = - repeat(num_inputs, [&]() { + repeat(num_inputs, [&]() { return read_only_accessor_from_write_accessor( create_random_filled_accessor_w(input_shape, allocator)); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 4be2bdf7bb..ad74fa7d36 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -10,11 +10,11 @@ TEST_SUITE(FF_TEST_SUITE) { float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10, 10}, + std::vector{10_n, 10_n}, }; TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index bbeb349ced..238c4ac361 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 7f97563217..b75614588c 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -12,12 +12,13 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{2_n}}; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({50_n}, DataType::FLOAT); GenericTensorAccessorR index_accessor = create_random_filled_accessor_r(output_shape, allocator); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 80a046fe37..8368fe4efd 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -7,8 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { - size_t batch_size = 10; - size_t feature_size = 10; + nonnegative_int batch_size = 10_n; + nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; @@ -29,8 +29,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::LayerNorm::init_kernel(managed_handle.raw_handle(), allocator, elementwise_affine, - batch_size, - feature_size, + batch_size.unwrap_nonnegative(), + feature_size.unwrap_nonnegative(), epsilon); 
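// Annotation (illustrative sketch, not part of the patch): the convention the
// reworked tests follow, assuming the helpers from test_utils.h used
// throughout this diff. Dimensions live as nonnegative_int (the _n literal)
// and are unwrapped exactly once, at raw kernel or cuDNN boundaries that
// still take plain int.
static void nonnegative_int_boundary_example(Allocator &allocator) {
  nonnegative_int batch_size = 10_n;
  nonnegative_int feature_size = 10_n;

  // Shape construction consumes nonnegative_int dimensions directly...
  TensorShape shape = make_tensor_shape_from_legion_dims(
      {batch_size, feature_size}, DataType::FLOAT);
  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);

  // ...while legacy kernel entry points receive the unwrapped value.
  int raw_batch_size = batch_size.unwrap_nonnegative();
  (void)raw_batch_size;
  (void)accessor;
}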
GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 25264b7a58..c1be78bd16 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index eb0702a970..ff74f6fb28 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -6,10 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { - size_t input_w = 10, input_h = 10, input_c = 3, input_n = 1; - size_t output_w = 5, output_h = 5, output_c = 3, output_n = 1; - size_t pad_h = 0, pad_w = 0, kernel_h = 2, kernel_w = 2, stride_h = 2, - stride_w = 2; + nonnegative_int input_w = 10_n; + nonnegative_int input_h = 10_n; + nonnegative_int input_c = 3_n; + nonnegative_int input_n = 1_n; + nonnegative_int output_w = 5_n; + nonnegative_int output_h = 5_n; + nonnegative_int output_c = 3_n; + nonnegative_int output_n = 1_n; + nonnegative_int pad_h = 0_n; + nonnegative_int pad_w = 0_n; + nonnegative_int kernel_h = 2_n; + nonnegative_int kernel_w = 2_n; + nonnegative_int stride_h = 2_n; + nonnegative_int stride_w = 2_n; PoolOp pool_type = PoolOp::MAX; @@ -21,23 +31,23 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); Pool2DPerDeviceState state = - Kernels::Pool2D::init_kernel(managed_handle.raw_handle(), - std::nullopt, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - kernel_h, - kernel_w, - stride_h, - stride_w, - pool_type); + Kernels::Pool2D::init_kernel(/*handle=*/managed_handle.raw_handle(), + /*activation=*/std::nullopt, + /*input_w=*/input_w.unwrap_nonnegative(), + /*input_h=*/input_h.unwrap_nonnegative(), + /*input_c=*/input_c.unwrap_nonnegative(), + /*input_n=*/input_n.unwrap_nonnegative(), + /*output_w=*/output_w.unwrap_nonnegative(), + /*output_h=*/output_h.unwrap_nonnegative(), + /*output_c=*/output_c.unwrap_nonnegative(), + /*output_n=*/output_n.unwrap_nonnegative(), + /*pad_h=*/pad_h.unwrap_nonnegative(), + /*pad_w=*/pad_w.unwrap_nonnegative(), + /*kernel_h=*/kernel_h.unwrap_nonnegative(), + /*kernel_w=*/kernel_w.unwrap_nonnegative(), + /*stride_h=*/stride_h.unwrap_nonnegative(), + /*stride_w=*/stride_w.unwrap_nonnegative(), + /*pool_type=*/pool_type); TensorShape input_shape = make_tensor_shape_from_legion_dims( {input_w, input_h, input_c, input_n}, DataType::FLOAT); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index a33748c0de..5078edee57 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t num_replicas = 5; TensorShape input_shape = make_tensor_shape_from_legion_dims( - {10, 10, 10, 10, 10}, DataType::FLOAT); + {10_n, 10_n, 10_n, 10_n, 10_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -20,7 +20,8 @@ TEST_SUITE(FF_TEST_SUITE) { 
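// Annotation (illustrative sketch, not part of the patch): every test case in
// this series now spells out the handle configuration instead of relying on
// the old default constructor. A helper along the lines below (name and
// placement assumed, not introduced by this patch) would keep the repeated
// literals in one place.
static ManagedPerDeviceFFHandle make_test_handle() {
  return ManagedPerDeviceFFHandle{
      /*workSpaceSize=*/1024 * 1024,
      /*allowTensorOpMathConversion=*/true};
}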
SUBCASE("forward_kernel") { TensorShape output_shape = - make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); + + make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 27223cc7b5..5133c4c89c 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -6,12 +6,12 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Replicate Forward and Backward Kernels") { - std::size_t num_replicas = 10; + nonnegative_int num_replicas = 10_n; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -48,12 +48,12 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { - std::size_t num_replicas = 2; + nonnegative_int num_replicas = 10_n; TensorShape input_shape = - make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5_n, num_replicas}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 5c04012da2..ee7530c017 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index c06919d603..bf23188a8f 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -7,9 +7,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - std::size_t reverse_dim_size = 10; - std::size_t in_blk_size = 10; - std::size_t num_out_blks = 1; + nonnegative_int reverse_dim_size = 10_n; + nonnegative_int in_blk_size = 10_n; + nonnegative_int num_out_blks = 1_n; TensorShape input_shape = make_tensor_shape_from_legion_dims( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); @@ -29,13 +29,14 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor.shape.num_elements()); + Kernels::Reverse::forward_kernel( + managed_stream.raw_stream(), + input_accessor.get_float_ptr(), + output_accessor.get_float_ptr(), + num_out_blks.unwrap_nonnegative(), + 
reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_accessor.shape.num_elements().unwrap_nonnegative()); CHECK(contains_non_zero(output_accessor)); } @@ -50,19 +51,19 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor.shape.num_elements()); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_grad_accessor.shape.num_elements().unwrap_nonnegative()); CHECK(contains_non_zero(input_grad_accessor)); } } TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 4; - std::size_t reverse_dim_size = 3; - std::size_t in_blk_size = 2; + nonnegative_int num_out_blks = 4_n; + nonnegative_int reverse_dim_size = 3_n; + nonnegative_int in_blk_size = 2_n; TensorShape input_shape = make_tensor_shape_from_legion_dims( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); @@ -90,10 +91,10 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), input_accessor_gpu.get_float_ptr(), output_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_gpu.shape.num_elements()); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_accessor_gpu.shape.num_elements().unwrap_nonnegative()); // Run CPU Cast Forward Kernel GenericTensorAccessorR input_accessor_cpu = @@ -118,10 +119,10 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), output_grad_accessor_gpu.get_float_ptr(), input_grad_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_gpu.shape.num_elements()); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_grad_accessor_gpu.shape.num_elements().unwrap_nonnegative()); // Run CPU Cast Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 5519c30b80..d4fb496f7b 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -6,7 +6,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { - int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; + nonnegative_int input_n = 1_n; + nonnegative_int input_c = 1_n; + nonnegative_int input_h = 1_n; + nonnegative_int input_w = 100_n; + nonnegative_int channels = 100_n; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -16,11 +20,16 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; - SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( - managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); + SoftmaxPerDeviceState state = + Kernels::Softmax::init_kernel(managed_handle.raw_handle(), + 0, + input_n.unwrap_nonnegative(), + channels.unwrap_nonnegative(), + input_h.unwrap_nonnegative(), + input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = 
create_random_filled_accessor_w(output_shape, allocator); @@ -47,7 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - output_grad_accessor.shape.num_elements()); + output_grad_accessor.shape.num_elements().unwrap_nonnegative()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index e94d102b71..d98f88a30e 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -8,7 +8,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { - size_t num_outputs = 2; + nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; @@ -21,9 +21,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({50_n}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -41,11 +41,11 @@ TEST_SUITE(FF_TEST_SUITE) { out_blk_sizes, in_blk_size, num_blks, - num_outputs); + num_outputs.unwrap_nonnegative()); } SUBCASE("backward_kernel") { - std::vector output_grad_ptrs(num_outputs); + std::vector output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); @@ -61,7 +61,7 @@ TEST_SUITE(FF_TEST_SUITE) { out_blk_sizes, in_blk_size, num_blks, - num_outputs); + num_outputs.unwrap_nonnegative()); } } } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index f87fb67921..cac43c6ff3 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -5,10 +5,12 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { - std::size_t num_dims = 2; - - std::vector perm = {ff_dim_t{nonnegative_int{0}}, - ff_dim_t{nonnegative_int{1}}}; + TransposeAttrs attrs = TransposeAttrs{ + FFOrdered{ + ff_dim_t{0_n}, + ff_dim_t{1_n}, + }, + }; ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -17,11 +19,8 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - TransposePerDeviceState state = - Kernels::Transpose::init_kernel(num_dims, perm); - TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -31,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( - managed_stream.raw_stream(), state, input_accessor, output_accessor); + managed_stream.raw_stream(), attrs, input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } @@ -43,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), - state, + attrs, output_grad_accessor, input_grad_accessor); diff --git 
a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml b/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml index 5f73bbbb8e..db476e771d 100644 --- a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml +++ b/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml @@ -6,7 +6,7 @@ features = [ includes = [ "kernels/attention_kernels.h", - "kernels/batch_norm_kernels.h", + "kernels/batch_norm_per_device_state.dtg.h", "kernels/conv_2d_kernels.h", "kernels/dropout_kernels.h", "kernels/element_binary_kernels.h", @@ -84,7 +84,3 @@ key = "device_specific_softmax_per_device_state" [[values]] type = "::FlexFlow::DeviceSpecific<::FlexFlow::TopKPerDeviceState>" key = "device_specific_topk_per_device_state" - -[[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::TransposePerDeviceState>" -key = "device_specific_transpose_per_device_state" diff --git a/lib/local-execution/include/local-execution/legion_tensor_shape.h b/lib/local-execution/include/local-execution/legion_tensor_shape.h deleted file mode 100644 index 3786383865..0000000000 --- a/lib/local-execution/include/local-execution/legion_tensor_shape.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TENSOR_SHAPE_H -#define _FLEXFLOW_RUNTIME_SRC_TENSOR_SHAPE_H - -#include "kernels/legion_dim.h" -#include "op-attrs/datatype.h" -#include "op-attrs/ff_dim_t.h" -#include "op-attrs/tensor_shape.dtg.h" -#include "utils/stack_vector/stack_vector.h" -#include "utils/visitable.h" -#include - -namespace FlexFlow { - -// TODO FIXME @lockshaw remove inheritance from legion tensor dims -struct LegionTensorShape : public use_visitable_cmp, - public LegionTensorDims { - LegionTensorShape() = delete; - LegionTensorShape(std::vector const &dims, DataType data_type); - LegionTensorShape(TensorShape const &); - - template - LegionTensorShape(stack_vector const &dims, - DataType data_type) - : LegionTensorDims(dims.start(), dims.end()), data_type(data_type) {} - - operator TensorShape() const; - -public: - DataType data_type; -}; - -ff_dim_t to_ff(legion_dim_t, size_t num_dims); -legion_dim_t legion_dim_from_ff_dim(ff_dim_t, size_t num_dims); - -ff_dim_t to_ff(legion_dim_t, TensorShape const &); -legion_dim_t legion_dim_from_ff_dim(ff_dim_t, TensorShape const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_id_t.enum.toml b/lib/local-execution/include/local-execution/task_id_t.enum.toml index 9cbe64c268..b0c82b5d26 100644 --- a/lib/local-execution/include/local-execution/task_id_t.enum.toml +++ b/lib/local-execution/include/local-execution/task_id_t.enum.toml @@ -205,9 +205,6 @@ name = "TOPK_FWD_TASK_ID" [[values]] name = "TOPK_BWD_TASK_ID" -[[values]] -name = "TRANSPOSE_INIT_TASK_ID" - [[values]] name = "TRANSPOSE_FWD_TASK_ID" diff --git a/lib/local-execution/src/legion_tensor_shape.cc b/lib/local-execution/src/legion_tensor_shape.cc deleted file mode 100644 index b227accc2e..0000000000 --- a/lib/local-execution/src/legion_tensor_shape.cc +++ /dev/null @@ -1,15 +0,0 @@ -#include "local-execution/legion_tensor_shape.h" -#include "kernels/legion_dim.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, size_t num_dims) { - return legion_dim_t(num_dims - ff_dim.value.get_value() - 1); -} - -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, TensorShape const &shape) { - return 
legion_dim_from_ff_dim(ff_dim, num_dims(shape)); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index eebef9039d..e652b666a8 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -85,10 +85,10 @@ static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - size_t qProjSize = acc.get_argument(QPROJSIZE); - size_t kProjSize = acc.get_argument(KPROJSIZE); - size_t vProjSize = acc.get_argument(VPROJSIZE); - size_t oProjSize = acc.get_argument(OPROJSIZE); + nonnegative_int qProjSize = acc.get_argument(QPROJSIZE); + nonnegative_int kProjSize = acc.get_argument(KPROJSIZE); + nonnegative_int vProjSize = acc.get_argument(VPROJSIZE); + nonnegative_int oProjSize = acc.get_argument(OPROJSIZE); PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape query_parallel_tensor_shape = @@ -108,29 +108,30 @@ static DeviceSpecificDeviceStates key_parallel_tensor_shape, value_parallel_tensor_shape)); - int kvSeqLength = get_kvSeqLength(parsed); - int qSize = get_qSize(parsed); - int kSize = get_kSize(parsed); - int vSize = get_vSize(parsed); - - int qoSeqLength = get_qoSeqLength(parsed); - int num_samples = get_num_samples(parsed); - int num_heads = attrs.num_heads; - - MHAPerDeviceState per_device_state = init_kernel(handle, - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - attrs.add_bias_kv); + nonnegative_int kvSeqLength = get_kvSeqLength(parsed); + nonnegative_int qSize = get_qSize(parsed); + nonnegative_int kSize = get_kSize(parsed); + nonnegative_int vSize = get_vSize(parsed); + + nonnegative_int qoSeqLength = get_qoSeqLength(parsed); + nonnegative_int num_samples = get_num_samples(parsed); + nonnegative_int num_heads = attrs.num_heads; + + MHAPerDeviceState per_device_state = + init_kernel(handle, + allocator, + num_samples.unwrap_nonnegative(), + num_heads.unwrap_nonnegative(), + qSize.unwrap_nonnegative(), + kSize.unwrap_nonnegative(), + vSize.unwrap_nonnegative(), + qProjSize.unwrap_nonnegative(), + kProjSize.unwrap_nonnegative(), + vProjSize.unwrap_nonnegative(), + oProjSize.unwrap_nonnegative(), + qoSeqLength.unwrap_nonnegative(), + kvSeqLength.unwrap_nonnegative(), + attrs.add_bias_kv); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index 1eae409ae2..ad331156b5 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -18,6 +18,8 @@ #include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -65,24 +67,30 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { FFIterationConfig iter_config = acc.get_argument(ITERATION_CONFIG); - int m = b_input.shape[legion_dim_t(0)]; - assert(m == output.shape[legion_dim_t(0)]); - int n = a_input.shape[legion_dim_t(1)]; - assert(n == output.shape[legion_dim_t(1)]); - int k = a_input.shape[legion_dim_t(0)]; - assert(k == b_input.shape[legion_dim_t(1)]); + nonnegative_int m = b_input.shape.at(legion_dim_t{0_n}); + assert(m == 
output.shape.at(legion_dim_t{0_n}));
+  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
+  assert(n == output.shape.at(legion_dim_t{1_n}));
+  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
+  assert(k == b_input.shape.at(legion_dim_t{1_n}));
   assert(a_input.shape.get_volume() == b_input.shape.get_volume());
   assert(a_input.shape.get_volume() == output.shape.get_volume());
-  int batch = 1;
-  for (int i = 2; i < a_input.shape.get_dim(); i++) {
-    int dim_size = a_input.shape[legion_dim_t(i)];
-    assert(dim_size == b_input.shape[legion_dim_t(i)]);
-    assert(dim_size == output.shape[legion_dim_t(i)]);
+  nonnegative_int batch = 1_n;
+  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
+    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
+    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
+    assert(dim_size == output.shape.at(legion_dim_t{i}));
     batch *= dim_size;
   }

+  auto get_raw_seq_len = [](std::optional<nonnegative_int> seq_len) -> int {
+    return transform(seq_len,
+                     [](nonnegative_int x) { return x.unwrap_nonnegative(); })
+        .value_or(-1);
+  };
+
   return profile(forward_kernel,
                  profiling,
                  "[BatchMatmul] forward_time = {:.2lf}ms\n",
@@ -90,12 +98,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
                  output.get_float_ptr(),
                  a_input.get_float_ptr(),
                  b_input.get_float_ptr(),
-                 m,
-                 n,
-                 k,
-                 batch,
-                 attrs.a_seq_length_dim,
-                 attrs.b_seq_length_dim,
+                 m.unwrap_nonnegative(),
+                 n.unwrap_nonnegative(),
+                 k.unwrap_nonnegative(),
+                 batch.unwrap_nonnegative(),
+                 get_raw_seq_len(attrs.a_seq_length_dim),
+                 get_raw_seq_len(attrs.b_seq_length_dim),
                  iter_config.seq_length);
 }

@@ -120,19 +128,20 @@ static std::optional
   assert(b_input.shape == b_input_grad.shape);

   // check dims
-  int m = b_input.shape[legion_dim_t(0)];
-  assert(m == output.shape[legion_dim_t(0)]);
-  int n = a_input.shape[legion_dim_t(1)];
-  assert(n == output.shape[legion_dim_t(1)]);
-  int k = a_input.shape[legion_dim_t(0)];
-  assert(k == b_input.shape[legion_dim_t(1)]);
+  nonnegative_int m = b_input.shape.at(legion_dim_t{0_n});
+  assert(m == output.shape.at(legion_dim_t{0_n}));
+  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
+  assert(n == output.shape.at(legion_dim_t{1_n}));
+  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
+  assert(k == b_input.shape.at(legion_dim_t{1_n}));
   assert(a_input.shape.get_volume() == b_input.shape.get_volume());
   assert(a_input.shape.get_volume() == output.shape.get_volume());
-  int batch = 1;
-  for (int i = 2; i < a_input.shape.dims.num_dims(); i++) {
-    int dim_size = a_input.shape[legion_dim_t(i)];
-    assert(dim_size == b_input.shape[legion_dim_t(i)]);
-    assert(dim_size == output.shape[legion_dim_t(i)]);
+
+  nonnegative_int batch = 1_n;
+  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
+    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
+    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
+    assert(dim_size == output.shape.at(legion_dim_t{i}));
     batch *= dim_size;
   }
@@ -146,10 +155,10 @@ static std::optional
                  a_input_grad.get_float_ptr(),
                  b_input.get_float_ptr(),
                  b_input_grad.get_float_ptr(),
-                 m,
-                 n,
-                 k,
-                 batch);
+                 m.unwrap_nonnegative(),
+                 n.unwrap_nonnegative(),
+                 k.unwrap_nonnegative(),
+                 batch.unwrap_nonnegative());
 }

 TaskImplFunction get_batch_matmul_fwd_task_impl() {
diff --git a/lib/local-execution/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h
index a7e29b1931..23389d5083 100644
--- a/lib/local-execution/src/ops/batch_matmul.h
+++
b/lib/local-execution/src/ops/batch_matmul.h @@ -4,7 +4,7 @@ #include "local-execution/op_task_invocation.h" #include "local-execution/op_task_signature.h" #include "local-execution/sim_environment.h" -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 3aed3111c7..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -75,21 +75,22 @@ static DeviceSpecificDeviceStates auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - int output_w = output.shape[legion_dim_t(0)]; - int output_h = output.shape[legion_dim_t(1)]; - int output_c = output.shape[legion_dim_t(2)]; - int output_n = output.shape[legion_dim_t(3)]; + nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); + nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); + nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); + nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); float *runningMean; - BatchNormPerDeviceState per_device_state = init_kernel(handle, - allocator, - runningMean, - output_n, - output_c, - output_h, - output_w, - attrs.relu); + BatchNormPerDeviceState per_device_state = + init_kernel(handle, + allocator, + runningMean, + output_n.unwrap_nonnegative(), + output_c.unwrap_nonnegative(), + output_h.unwrap_nonnegative(), + output_w.unwrap_nonnegative(), + attrs.relu); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -140,7 +141,7 @@ static std::optional scale.get_float_ptr(), scale_grad.get_float_ptr(), bias_grad.get_float_ptr(), - output.shape.get_volume()); + output.shape.get_volume().unwrap_nonnegative()); } TaskImplFunction get_batch_norm_init_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index d7c5c22170..c4d9c4b21d 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -62,19 +62,19 @@ static DeviceSpecificDeviceStates auto filter_grad = acc.get_tensor_grad(FILTER); Conv2DPerDeviceState per_device_state = - init_kernel(handle, - attrs.activation, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.padding_h, - attrs.padding_w, - attrs.stride_h, - attrs.stride_w, - input, - output, - filter.get_float_ptr(), - filter_grad.get_float_ptr()); + init_kernel(/*handle=*/handle, + /*activation=*/attrs.activation, + /*kernel_h=*/attrs.kernel_h.unwrap_nonnegative(), + /*kernel_w=*/attrs.kernel_w.unwrap_nonnegative(), + /*groups=*/attrs.groups.unwrap_nonnegative(), + /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), + /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), + /*stride_h=*/attrs.stride_h.unwrap_nonnegative(), + /*stride_w=*/attrs.stride_w.unwrap_nonnegative(), + /*input=*/input, + /*output=*/output, + /*filter_ptr=*/filter.get_float_ptr(), + /*filter_grad_ptr=*/filter_grad.get_float_ptr()); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index a015c64f4d..a43c0f757f 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -15,8 +15,8 @@ #include "gather.h" #include "kernels/gather_kernels.h" -#include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" +#include 
"utils/nonnegative_int/nonnegative_range.h" #include namespace FlexFlow { @@ -72,10 +72,11 @@ static DeviceSpecificDeviceStates assert(input.shape.get_dim() == index.shape.get_dim()); assert(output.shape.get_dim() == index.shape.get_dim()); - for (int i = 0; i < input.shape.get_dim(); i++) { - assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); + for (nonnegative_int i : nonnegative_range(input.shape.get_dim())) { + assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); if (i != legion_dim.value) { - assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); + assert(input.shape.at(legion_dim_t{i}) == + index.shape.at(legion_dim_t{i})); } } diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index e99d27319c..c01475d4a4 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -15,12 +15,12 @@ #include "layer_norm.h" #include "kernels/layer_norm_kernels.h" -#include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/exception.h" #include "utils/hash-utils.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include namespace FlexFlow { @@ -119,27 +119,25 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto handle = acc.get_argument(HANDLE); - // question: how to get batch_size and effective_num_elements - int64_t effective_batch_size, effective_num_elements; - int M = 1; + nonnegative_int M = 1_n; for (int i = 0; i < attrs.axes.size(); i++) { - legion_dim_t legion_dim = legion_dim_from_ff_dim( - attrs.axes[i], get_tensor_shape(input.shape, input.data_type)); + legion_dim_t legion_dim = + legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims()); M *= input.shape.at(legion_dim); } - int num_replicas = 1; - for (int i = 0; i < input.shape.num_dims(); i++) { - num_replicas *= input.shape.at(legion_dim_t(i)); - effective_num_elements = M; - effective_batch_size = input.shape.get_volume() / M; + nonnegative_int num_replicas = 1_n; + for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { + num_replicas *= input.shape.at(legion_dim_t{i}); } + nonnegative_int effective_num_elements = M; + nonnegative_int effective_batch_size = input.shape.get_volume() / M; LayerNormPerDeviceState per_device_state = init_kernel(handle, allocator, attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, + effective_batch_size.unwrap_nonnegative(), + effective_num_elements.unwrap_nonnegative(), attrs.eps); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 1eb0360db4..2de850f209 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -66,21 +66,22 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}); - int batch_size = output.shape.at(ff_dim_t{nonnegative_int{1}}); + nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); + nonnegative_int batch_size = output.shape.at(ff_dim_t{1_n}); float *one_ptr; - LinearPerDeviceState per_device_state = init_kernel(handle, - one_ptr, - attrs.activation, - attrs.regularizer, - attrs.use_bias, - input.data_type, - 
weight.data_type,
-                                                     output.data_type,
-                                                     batch_size,
-                                                     attrs.out_channels);
+  LinearPerDeviceState per_device_state =
+      init_kernel(handle,
+                  one_ptr,
+                  attrs.activation,
+                  attrs.regularizer,
+                  attrs.use_bias,
+                  input.data_type,
+                  weight.data_type,
+                  output.data_type,
+                  batch_size.unwrap_nonnegative(),
+                  attrs.out_channels.unwrap_nonnegative());
   return DeviceSpecificDeviceStates{
       DeviceSpecific::create(per_device_state)};
 }
@@ -96,9 +97,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument(PROFILING);
   auto attrs = acc.get_argument(ATTRS);

-  int in_dim = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int batch_size = output.shape.get_volume() / out_dim;
+  nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n});
+  nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n});
+  nonnegative_int batch_size = output.shape.get_volume() / out_dim;

   float const *bias_ptr = NULL;
   if (attrs.use_bias) {
@@ -113,9 +114,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
                  output.get_float_ptr(),
                  weight.get_float_ptr(),
                  bias_ptr,
-                 in_dim,
-                 out_dim,
-                 batch_size);
+                 in_dim.unwrap_nonnegative(),
+                 out_dim.unwrap_nonnegative(),
+                 batch_size.unwrap_nonnegative());
 }
 ;
@@ -140,9 +141,9 @@ static std::optional
     bias_ptr = bias.get_float_ptr();
   }

-  int in_dim = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int batch_size = output.shape.get_volume() / out_dim;
+  nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n});
+  nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n});
+  nonnegative_int batch_size = output.shape.get_volume() / out_dim;

   return profile(backward_kernel,
                  profiling,
@@ -155,9 +156,9 @@ static std::optional
                  weight.get_float_ptr(),
                  weight_grad.get_float_ptr(),
                  bias_ptr,
-                 in_dim,
-                 out_dim,
-                 batch_size);
+                 in_dim.unwrap_nonnegative(),
+                 out_dim.unwrap_nonnegative(),
+                 batch_size.unwrap_nonnegative());
 }

 TaskImplFunction get_linear_init_task_impl() {
diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc
index a1167a731c..13f6a78381 100644
--- a/lib/local-execution/src/ops/pool_2d.cc
+++ b/lib/local-execution/src/ops/pool_2d.cc
@@ -22,6 +22,22 @@ OpTaskInvocation init(Pool2DAttrs const &attrs) {
   return {task_id_t::POOL2D_INIT_TASK_ID, binding};
 }

+static nonnegative_int calculate_padding(nonnegative_int output_size,
+                                         nonnegative_int stride,
+                                         nonnegative_int kernel_size,
+                                         nonnegative_int input_size) {
+  int o = output_size.unwrap_nonnegative();
+  int s = stride.unwrap_nonnegative();
+  int k = kernel_size.unwrap_nonnegative();
+  int i = input_size.unwrap_nonnegative();
+
+  // Solve o = (i + 2p - k) / s + 1 for the padding p, rounding up:
+  // p = ceil(((o - 1) * s + k - i) / 2) = ((o - 1) * s + k - i + 1) / 2.
+  return nonnegative_int{
+      ((o - 1) * s + k - i + 1) / 2,
+  };
+}
+
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument(ATTRS);
@@ -30,56 +46,33 @@ static DeviceSpecificDeviceStates
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

-  int input_w = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int input_h = input.shape.at(ff_dim_t{nonnegative_int{1}}) + 1;
-  int input_c = input.shape.at(ff_dim_t{nonnegative_int{2}}) + 1;
-  int input_n = input.shape.at(ff_dim_t{nonnegative_int{3}}) + 1;
-  int output_w = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1;
-  int output_h = output.shape.at(ff_dim_t{nonnegative_int{1}}) + 1;
-  int output_c =
output.shape.at(ff_dim_t{nonnegative_int{2}}) + 1; - int output_n = output.shape.at(ff_dim_t{nonnegative_int{3}}) + 1; - - printf("init pool (input): n(%d) c(%d) h(%d) " - "w(%d)\n", - input_n, - input_c, - input_h, - input_w); - printf("init pool (output): n(%d) c(%d) h(%d) w(%d)\n", - output_n, - output_c, - output_h, - output_w); - - int pad_h = - ((output_h - 1) * attrs.stride_h + attrs.kernel_h - input_h + 1) / 2; - int pad_w = - ((output_w - 1) * attrs.stride_w + attrs.kernel_w - input_w + 1) / 2; - if (pad_h != attrs.padding_h) { - printf("Warning: changing pool_padding_h to satisfy output_h size\n"); - } - - if (pad_w != attrs.padding_w) { - printf("Warning: changing pool_padding_w to satisfy output_w size\n"); - } - - Pool2DPerDeviceState per_device_state = init_kernel(handle, - attrs.activation, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - attrs.kernel_h, - attrs.kernel_w, - attrs.stride_h, - attrs.stride_w, - attrs.pool_type); + nonnegative_int input_w = input.shape.at(ff_dim_t{0_n}); + nonnegative_int input_h = input.shape.at(ff_dim_t{1_n}); + nonnegative_int input_c = input.shape.at(ff_dim_t{2_n}); + nonnegative_int input_n = input.shape.at(ff_dim_t{3_n}); + nonnegative_int output_w = output.shape.at(ff_dim_t{0_n}); + nonnegative_int output_h = output.shape.at(ff_dim_t{1_n}); + nonnegative_int output_c = output.shape.at(ff_dim_t{2_n}); + nonnegative_int output_n = output.shape.at(ff_dim_t{3_n}); + + Pool2DPerDeviceState per_device_state = + init_kernel(handle, + attrs.activation, + input_w.unwrap_nonnegative(), + input_h.unwrap_nonnegative(), + input_c.unwrap_nonnegative(), + input_n.unwrap_nonnegative(), + output_w.unwrap_nonnegative(), + output_h.unwrap_nonnegative(), + output_c.unwrap_nonnegative(), + output_n.unwrap_nonnegative(), + attrs.padding_h.unwrap_nonnegative(), + attrs.padding_w.unwrap_nonnegative(), + attrs.kernel_h.unwrap_nonnegative(), + attrs.kernel_w.unwrap_nonnegative(), + attrs.stride_h.unwrap_nonnegative(), + attrs.stride_w.unwrap_nonnegative(), + attrs.pool_type); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index a043d9f847..3f92d7fd77 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -41,9 +41,14 @@ static DeviceSpecificDeviceStates OperatorType op_type = attrs.op_type; - size_t reduction_size = input.shape.get_volume() / output.shape.get_volume(); + nonnegative_int reduction_size = + input.shape.get_volume() / output.shape.get_volume(); ReducePerDeviceState per_device_state = - init_kernel(handle, op_type, reduction_size, input.shape, output.shape); + init_kernel(handle, + op_type, + reduction_size.unwrap_nonnegative(), + input.shape, + output.shape); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index 1e85d7186e..cab7c3e22d 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -50,14 +50,14 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - size_t num_replicas = attrs.reduction_degree; + nonnegative_int num_replicas = attrs.reduction_degree; return profile(forward_kernel, profiling_settings, "[Reduction] forward_time = {:.2lf}ms\n", 
input, output, - num_replicas); + num_replicas.unwrap_nonnegative()); } static std::optional diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 56bbfdd371..17e0065de5 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -62,14 +62,14 @@ static std::optional auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - auto const &attrs = acc.get_argument(ATTRS); + auto attrs = acc.get_argument(ATTRS); return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", output_grad, input_grad, - attrs.replicate_degree); + attrs.replicate_degree.unwrap_nonnegative()); } TaskImplFunction get_replicate_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 8ac4c045c7..94dfc90f7a 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -17,6 +17,7 @@ #include "kernels/accessor.h" #include "kernels/reverse_kernels.h" #include "op-attrs/get_output_shapes.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -48,16 +49,18 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int output_size = output.shape.get_volume(); + nonnegative_int output_size = output.shape.get_volume(); auto axis = attrs.axis; - coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < output.shape.get_dim(); i++) { + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{nonnegative_int{i}}); + in_blk_size *= output.shape.at(ff_dim_t{i}); } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{nonnegative_int{i}}); + reverse_dim_size = output.shape.at(ff_dim_t{i}); } else { - num_out_blks *= output.shape.at(ff_dim_t{nonnegative_int{i}}); + num_out_blks *= output.shape.at(ff_dim_t{i}); } } @@ -66,10 +69,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { "[reverse] forward_time = {:.2lf}ms\n", input.get_float_ptr(), output.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - output_size); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + output_size.unwrap_nonnegative()); } static std::optional @@ -79,15 +82,18 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input_grad.shape.get_dim() - attrs.axis.value.get_value() - 1; - coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < input_grad.shape.get_dim(); i++) { + int axis = input_grad.shape.num_dims().unwrap_nonnegative() - + attrs.axis.value.unwrap_nonnegative() - 1; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); + in_blk_size *= input_grad.shape.at(ff_dim_t{i}); } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); + reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); } 
else {
-      num_out_blks *= input_grad.shape.at(ff_dim_t{nonnegative_int{i}});
+      num_out_blks *= input_grad.shape.at(ff_dim_t{i});
     }
   }

@@ -96,10 +102,10 @@ static std::optional
                  "[reverse] backward_time = {:.2lf}ms\n",
                  output_grad.get_float_ptr(),
                  input_grad.get_float_ptr(),
-                 num_out_blks,
-                 reverse_dim_size,
-                 in_blk_size,
-                 input_grad.shape.get_volume());
+                 num_out_blks.unwrap_nonnegative(),
+                 reverse_dim_size.unwrap_nonnegative(),
+                 in_blk_size.unwrap_nonnegative(),
+                 input_grad.shape.get_volume().unwrap_nonnegative());
 }

 TaskImplFunction get_reverse_fwd_task_impl() {
diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc
index 71a6ce435e..a1f29e2c98 100644
--- a/lib/local-execution/src/ops/softmax.cc
+++ b/lib/local-execution/src/ops/softmax.cc
@@ -59,18 +59,18 @@ static DeviceSpecificDeviceStates
   auto output = acc.get_tensor(OUTPUT);
   auto const &attrs = acc.get_argument(ATTRS);

-  int output_w = output.shape.at(legion_dim_t(0));
-  int output_h = output.shape.at(legion_dim_t(1));
-  int output_c = output.shape.at(legion_dim_t(2));
-  int output_n = output.shape.at(legion_dim_t(3));
+  nonnegative_int output_w = output.shape.at(legion_dim_t{0_n});
+  nonnegative_int output_h = output.shape.at(legion_dim_t{1_n});
+  nonnegative_int output_c = output.shape.at(legion_dim_t{2_n});
+  nonnegative_int output_n = output.shape.at(legion_dim_t{3_n});

   SoftmaxPerDeviceState per_device_state = init_kernel(handle,
-                                                       attrs.dim.value.get_value(),
-                                                       output_n,
-                                                       output_c,
-                                                       output_h,
-                                                       output_w);
+                                                       attrs.dim.value.unwrap_nonnegative(),
+                                                       output_n.unwrap_nonnegative(),
+                                                       output_c.unwrap_nonnegative(),
+                                                       output_h.unwrap_nonnegative(),
+                                                       output_w.unwrap_nonnegative());

   return DeviceSpecificDeviceStates{
       DeviceSpecific::create(per_device_state)};
@@ -109,7 +109,7 @@ static std::optional
                  "[SoftMax] backward_time = {:.2lf}ms\n",
                  output_grad.get_float_ptr(),
                  input_grad.get_float_ptr(),
-                 output_grad.shape.get_volume());
+                 output_grad.shape.get_volume().unwrap_nonnegative());
 }

 TaskImplFunction get_softmax_init_task_impl() {
diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc
index c289bca205..f119ae235b 100644
--- a/lib/local-execution/src/ops/split.cc
+++ b/lib/local-execution/src/ops/split.cc
@@ -19,6 +19,7 @@
 #include "op-attrs/get_output_shapes.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"
+#include "utils/nonnegative_int/nonnegative_range.h"

 namespace FlexFlow {

@@ -44,19 +45,20 @@ OpTaskInvocation backward(SplitAttrs const &attrs) {
   return {task_id_t::SPLIT_BWD_TASK_ID, binding};
 }

-void calc_block_size(coord_t &num_blocks,
-                     coord_t &block_size,
-                     ArrayShape const &array_shape,
-                     ff_dim_t axis) {
-  num_blocks = 1;
-  block_size = 1;
-  for (int d = 0; d < array_shape.num_elements(); d++) {
-    if (d <= axis.value.get_value()) {
-      block_size *= array_shape.at(legion_dim_t(d));
+// For legion-ordered dims [d0, d1, ..., dn-1] and axis a, block_size is the
+// product d0 * ... * da and num_blocks is the product d(a+1) * ... * d(n-1).
+static std::pair<nonnegative_int, nonnegative_int>
+    calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) {
+  nonnegative_int num_blocks = 1_n;
+  nonnegative_int block_size = 1_n;
+  for (nonnegative_int d : nonnegative_range(array_shape.get_dim())) {
+    if (d <= axis.value) {
+      block_size *= array_shape.at(legion_dim_t{d});
     } else {
-      num_blocks *= array_shape.at(legion_dim_t(d));
+      num_blocks *= array_shape.at(legion_dim_t{d});
     }
   }
+  return {num_blocks, block_size};
 }

 static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
@@ -65,13 +67,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
   auto output = acc.get_tensor(OUTPUT);
   auto attrs = acc.get_argument(ATTRS);

-  coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS];
-  calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis);
+  coord_t out_block_sizes[MAX_NUM_OUTPUTS];
+  auto [num_blocks, in_block_size] = calc_block_size(input.shape, attrs.axis);
   for (int i = 0; i < attrs.splits.size(); i++) {
-    coord_t out_num_blocks;
-    calc_block_size(
-        out_num_blocks, out_block_size[i], output.shape, attrs.axis);
+    auto [_, out_block_size] = calc_block_size(output.shape, attrs.axis);
+    out_block_sizes[i] = out_block_size.unwrap_nonnegative();
   }
   float *output_float_ptr = output.get_float_ptr();
   return profile(forward_kernel,
@@ -79,9 +80,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
                  "Split forward_time = {:.2lf}ms\n",
                  &output_float_ptr,
                  input.get_float_ptr(),
-                 out_block_size,
-                 in_block_size,
-                 num_blocks,
+                 out_block_sizes,
+                 in_block_size.unwrap_nonnegative(),
+                 num_blocks.unwrap_nonnegative(),
                  attrs.splits.size());
 }

@@ -93,12 +94,13 @@ static std::optional
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto attrs = acc.get_argument(ATTRS);

-  coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS];
-  calc_block_size(num_blocks, in_block_size, input_grad.shape, attrs.axis);
+  coord_t out_block_sizes[MAX_NUM_OUTPUTS];
+  auto [num_blocks, in_block_size] =
+      calc_block_size(input_grad.shape, attrs.axis);
+
   for (int i = 0; i < attrs.splits.size(); i++) {
-    coord_t out_num_blocks;
-    calc_block_size(
-        out_num_blocks, out_block_size[i], output_grad.shape, attrs.axis);
+    auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis);
+    out_block_sizes[i] = out_block_size.unwrap_nonnegative();
   }
   float const *output_grad_ptr = output_grad.get_float_ptr();
   return profile(backward_kernel,
@@ -106,9 +108,9 @@ static std::optional
                  "Split backward_time = {:.2lf}ms\n",
                  input_grad.get_float_ptr(),
                  &output_grad_ptr,
-                 out_block_size,
-                 in_block_size,
-                 num_blocks,
+                 out_block_sizes,
+                 in_block_size.unwrap_nonnegative(),
+                 num_blocks.unwrap_nonnegative(),
                  attrs.splits.size());
 }

diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc
index 7f3519529a..e9d202a38f 100644
--- a/lib/local-execution/src/ops/topk.cc
+++ b/lib/local-execution/src/ops/topk.cc
@@ -75,8 +75,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

-  int length = input.shape.at(legion_dim_t(0)) + 1;
-  size_t batch_size = input.shape.get_volume() / length;
+  nonnegative_int length = input.shape.at(legion_dim_t{0_n});
+  nonnegative_int batch_size = input.shape.get_volume() / length;
   auto indices = acc.get_tensor(INDICES);

   return profile(forward_kernel,
@@ -86,9 +86,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
                  input.get_float_ptr(),
                  output.get_float_ptr(),
                  indices.get_int32_ptr(),
-                 batch_size,
-                 length,
-                 attrs.k,
+                 batch_size.unwrap_nonnegative(),
+                 length.unwrap_nonnegative(),
+                 attrs.k.unwrap_nonnegative(),
                  attrs.sorted);
 }

@@ -104,8 +104,8 @@ static std::optional
   auto indices = acc.get_tensor(INDICES);

-  int length = input_grad.shape.at(legion_dim_t(0)) + 1;
-  size_t batch_size = input_grad.shape.get_volume() / length;
+  nonnegative_int length = input_grad.shape.at(legion_dim_t{0_n});
+  nonnegative_int batch_size = input_grad.shape.get_volume() / length;

   return profile(backward_kernel,
                  profiling,
@@ -114,9 +114,9 @@ static std::optional
                  output_grad.get_float_ptr(),
indices.get_int32_ptr(), input_grad.get_float_ptr(), - batch_size, - length, - attrs.k); + batch_size.unwrap_nonnegative(), + length.unwrap_nonnegative(), + attrs.k.unwrap_nonnegative()); } TaskImplFunction get_topk_init_task_impl() { diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 30310d3349..0769cbb76f 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -28,39 +28,11 @@ enum Slots { OUTPUT, // tensor ATTRS, PROFILING, - PER_DEVICE_STATE, }; -OpTaskInvocation init(TransposeAttrs const &attrs) { - OpTaskBinding binding; - binding.bind_arg(ATTRS, attrs); - return {task_id_t::TRANSPOSE_INIT_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - int size = int_from_size_t(attrs.perm.size()); - - std::vector perm = [&] { - std::vector result; - for (int i : range(size)) { - result.push_back(ff_dim_t{nonnegative_int{size - i - 1}}); - } - return result; - }(); - - TransposePerDeviceState per_device_state = init_kernel(size, perm); - - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - OpTaskInvocation forward(TransposeAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_tensor(0)); @@ -71,8 +43,7 @@ OpTaskInvocation forward(TransposeAttrs const &attrs) { static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -80,7 +51,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, "[Transpose] Forward_time = {:.2lf} [ms]", - per_device_state, + attrs, input, output); } @@ -88,8 +59,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); @@ -97,7 +67,7 @@ static std::optional return profile(backward_kernel, profiling, "[Transpose] Backward_time = {:.2lf} [ms]", - per_device_state, + attrs, output_grad, input_grad); } @@ -108,42 +78,31 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) { return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding}; } -TaskImplFunction get_transpose_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; -} TaskImplFunction get_transpose_fwd_task_impl() { return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; } + TaskImplFunction get_transpose_bwd_task_impl() { return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; } -OpTaskSignature get_transpose_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_return_value(); - return init; -} OpTaskSignature get_transpose_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_input_slot(INPUT); 
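+  // With TransposePerDeviceState removed, the forward/backward kernels take
+  // TransposeAttrs directly, so this signature requires no PER_DEVICE_STATE
+  // slot.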
fwd.add_output_slot(OUTPUT); return fwd; } + OpTaskSignature get_transpose_bwd_signature() { OpTaskSignature bwd = infer_bwd_signature(get_transpose_fwd_signature()); return bwd; } std::vector get_task_ids(TransposeAttrs const &) { - return {task_id_t::TRANSPOSE_INIT_TASK_ID, - task_id_t::TRANSPOSE_FWD_TASK_ID, - task_id_t::TRANSPOSE_BWD_TASK_ID}; + return {task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID}; } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h index 0f3a2e80a0..f2ce014aa7 100644 --- a/lib/local-execution/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -9,15 +9,12 @@ namespace FlexFlow { std::vector get_task_ids(TransposeAttrs const &); -TaskImplFunction get_transpose_init_task_impl(); TaskImplFunction get_transpose_fwd_task_impl(); TaskImplFunction get_transpose_bwd_task_impl(); -OpTaskSignature get_transpose_init_signature(); OpTaskSignature get_transpose_fwd_signature(); OpTaskSignature get_transpose_bwd_signature(); -OpTaskInvocation init(TransposeAttrs const &); OpTaskInvocation forward(TransposeAttrs const &); OpTaskInvocation backward(TransposeAttrs const &); diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc index ca428aad25..60928d42d7 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/local-execution/src/task_signature_impl.cc @@ -193,9 +193,6 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::TOPK_BWD_TASK_ID: return TaskSignatureAndImpl{get_topk_bwd_task_impl(), get_topk_bwd_signature()}; - case task_id_t::TRANSPOSE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_transpose_init_task_impl(), - get_transpose_init_signature()}; case task_id_t::TRANSPOSE_FWD_TASK_ID: return TaskSignatureAndImpl{get_transpose_fwd_task_impl(), get_transpose_fwd_signature()}; @@ -296,7 +293,6 @@ OpTaskInvocation init(ComputationGraphOpAttrs const &op) { [](ReshapeAttrs const &attrs) { return init(attrs); }, [](SoftmaxAttrs const &attrs) { return init(attrs); }, [](TopKAttrs const &attrs) { return init(attrs); }, - [](TransposeAttrs const &attrs) { return init(attrs); }, [](auto const &attrs) -> OpTaskInvocation { throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); }, diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 1ec441fbca..46827e3981 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -19,16 +19,17 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalSlotsBacking -- Attention Op") { // allocate input memory Allocator allocator = create_local_cpu_memory_allocator(); - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, + TensorDims{ + FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; TensorShape query_shape = input_tensor_shape; diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index f52fccb1ed..0fab0f6a60 
100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -9,16 +9,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalTaskArgumentAccessor") { Allocator allocator = create_local_cpu_memory_allocator(); - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, + TensorDims{ + FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index e18b7ea2de..58d6d9be6c 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -14,8 +14,8 @@ TEST_SUITE(FF_TEST_SUITE) { TaskRegistry task_registry = empty_task_registry(); layer_guid_t layer_guid = layer_guid_t{Node{0}}; - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; ComputationGraphOpAttrs attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -76,7 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(correct_task_mapping == task_registry.task_mapping); } SUBCASE("different attrs, still same task fn mapping") { - int embed_dim = 100; + nonnegative_int embed_dim = 100_n; layer_guid_t layer_3 = layer_guid_t{Node{3}}; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ @@ -98,7 +98,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("equality") { TaskRegistry other_task_registry = empty_task_registry(); SUBCASE("different attrs is still equal") { - int embed_dim = 100; + nonnegative_int embed_dim = 100_n; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, diff --git a/lib/models/include/models/bert/bert_config.struct.toml b/lib/models/include/models/bert/bert_config.struct.toml index 398210cf48..cc2a8eb0a7 100644 --- a/lib/models/include/models/bert/bert_config.struct.toml +++ b/lib/models/include/models/bert/bert_config.struct.toml @@ -12,27 +12,28 @@ features = [ includes = [ "op-attrs/activation.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "vocab_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "hidden_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_encoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dim_feedforward" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "hidden_act" @@ -64,8 +65,8 @@ type = "float" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml index 667a6531c3..e7d83efd07 100644 --- a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml +++ 
b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml @@ -14,6 +14,7 @@ includes = [ "", "", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -25,19 +26,19 @@ src_includes = [ [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dense_layers" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "dense_feature_layers" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "feature_shapes" -type = "std::map" +type = "std::map" [[fields]] name = "input_features" diff --git a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml index a2a75c83bb..1290420e16 100644 --- a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml +++ b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml @@ -10,13 +10,17 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_classes" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "aux_logits" diff --git a/lib/models/include/models/split_test/split_test.h b/lib/models/include/models/split_test/split_test.h index b03e45b2d2..dd7089c4f6 100644 --- a/lib/models/include/models/split_test/split_test.h +++ b/lib/models/include/models/split_test/split_test.h @@ -12,7 +12,7 @@ namespace FlexFlow { * @note This is a tiny model developed for testing the original Unity * implementation. It is not a "real" model and has never been trained. */ -ComputationGraph get_split_test_computation_graph(int batch_size); +ComputationGraph get_split_test_computation_graph(nonnegative_int batch_size); } // namespace FlexFlow diff --git a/lib/models/include/models/transformer/transformer_config.struct.toml b/lib/models/include/models/transformer/transformer_config.struct.toml index 23b0478dde..2a0b39feb9 100644 --- a/lib/models/include/models/transformer/transformer_config.struct.toml +++ b/lib/models/include/models/transformer/transformer_config.struct.toml @@ -1,6 +1,5 @@ namespace = "FlexFlow" name = "TransformerConfig" - features = [ "eq", "ord", @@ -10,33 +9,37 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_features" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dim_feedforward" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_encoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_decoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dropout" @@ -48,4 +51,4 @@ type = "float" [[fields]] name = "vocab_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/models/src/models/bert/bert.cc b/lib/models/src/models/bert/bert.cc index cf48f2399b..a5d63e8fdc 100644 --- a/lib/models/src/models/bert/bert.cc +++ b/lib/models/src/models/bert/bert.cc @@ -6,20 +6,22 @@ namespace FlexFlow { BertConfig get_default_bert_config() { - return BertConfig{/*vocab_size=*/30522, - 
/*hidden_size=*/768, - /*num_encoder_layers=*/12, - /*num_heads=*/12, - /*dim_feedforward=*/3072, - /*hidden_act=*/Activation::GELU, - /*hidden_dropout_prob=*/0.1, - /*attention_probs_dropout_prob=*/0.1, - /*initializer_range=*/0.02, - /*layer_norm_eps=*/1e-12, - /*position_embedding_type=*/"absolute", - /*classifier_dropout=*/0.1, - /*sequence_length=*/512, - /*batch_size=*/64}; + return BertConfig{ + /*vocab_size=*/30522_n, + /*hidden_size=*/768_n, + /*num_encoder_layers=*/12_n, + /*num_heads=*/12_n, + /*dim_feedforward=*/3072_n, + /*hidden_act=*/Activation::GELU, + /*hidden_dropout_prob=*/0.1, + /*attention_probs_dropout_prob=*/0.1, + /*initializer_range=*/0.02, + /*layer_norm_eps=*/1e-12, + /*position_embedding_type=*/"absolute", + /*classifier_dropout=*/0.1, + /*sequence_length=*/512_n, + /*batch_size=*/64_n, + }; } tensor_guid_t @@ -56,9 +58,10 @@ tensor_guid_t InitializerAttrs const &bias_initializer, InitializerAttrs const &projection_initializer) { assert(num_dims(cgb.get_shape(input)) == 3); - std::vector layer_norm_axis = {2}; // Apply layernorm across the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; + std::vector layer_norm_axis = { + relative_ff_dim_t{-1}}; // Apply layernorm across the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; tensor_guid_t self_attention = cgb.multihead_attention(input, input, @@ -127,7 +130,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { InitializerAttrs bias_initializer = InitializerAttrs{ZeroInitializerAttrs{}}; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.hidden_size}}, DataType::FLOAT, }; @@ -149,7 +152,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { assert( (cgb.get_shape(out_prob) == TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.vocab_size}}, DataType::FLOAT, })); diff --git a/lib/models/src/models/candle_uno/candle_uno.cc b/lib/models/src/models/candle_uno/candle_uno.cc index 4d52d515fb..60422359a5 100644 --- a/lib/models/src/models/candle_uno/candle_uno.cc +++ b/lib/models/src/models/candle_uno/candle_uno.cc @@ -1,32 +1,34 @@ #include "models/candle_uno/candle_uno.h" #include "pcg/initializers/glorot_normal_attrs.dtg.h" +#include "utils/containers/repeat_element.h" namespace FlexFlow { CandleUnoConfig get_default_candle_uno_config() { - CandleUnoConfig config{ - /*batch_size=*/64, - /*dense_layers=*/std::vector(4, 4192), - /*dense_feature_layers=*/std::vector(8, 4192), - /*feature_shapes=*/std::map{}, - /*input_features=*/std::map{}, + return CandleUnoConfig{ + /*batch_size=*/64_n, + /*dense_layers=*/repeat_element(/*num_times=*/4_n, /*element=*/4192_n), + /*dense_feature_layers=*/ + repeat_element(/*num_times=*/8_n, /*element=*/4192_n), + /*feature_shapes=*/ + { + {"dose", 1_n}, + {"cell.rnaseq", 942_n}, + {"drug.descriptors", 5270_n}, + {"drug.fingerprints", 2048_n}, + }, + /*input_features=*/ + { + {"dose1", "dose"}, + {"dose2", "dose"}, + {"cell.rnaseq", "cell.rnaseq"}, + {"drug1.descriptors", "drug.descriptors"}, + {"drug1.fingerprints", "drug.fingerprints"}, + {"drug2.descriptors", "drug.descriptors"}, + {"drug2.fingerprints", "drug.fingerprints"}, + }, /*dropout=*/0.1, /*residual=*/false}; - - config.feature_shapes["dose"] = 1; - 
config.feature_shapes["cell.rnaseq"] = 942; - config.feature_shapes["drug.descriptors"] = 5270; - config.feature_shapes["drug.fingerprints"] = 2048; - - config.input_features["dose1"] = "dose"; - config.input_features["dose2"] = "dose"; - config.input_features["cell.rnaseq"] = "cell.rnaseq"; - config.input_features["drug1.descriptors"] = "drug.descriptors"; - config.input_features["drug1.fingerprints"] = "drug.fingerprints"; - config.input_features["drug2.descriptors"] = "drug.descriptors"; - config.input_features["drug2.fingerprints"] = "drug.fingerprints"; - - return config; } tensor_guid_t create_candle_uno_feature_model( @@ -35,7 +37,7 @@ tensor_guid_t create_candle_uno_feature_model( tensor_guid_t const &input, InitializerAttrs const &kernel_initializer) { tensor_guid_t t = input; - for (int const dense_dim : config.dense_feature_layers) { + for (nonnegative_int dense_dim : config.dense_feature_layers) { t = cgb.dense(t, dense_dim, Activation::RELU, @@ -56,7 +58,7 @@ ComputationGraph InitializerAttrs{GlorotNormalAttrs{/*seed=*/0}}; auto create_input_tensor = - [&](FFOrdered const &dims) -> tensor_guid_t { + [&](FFOrdered const &dims) -> tensor_guid_t { TensorShape input_shape = TensorShape{ TensorDims{dims}, DataType::FLOAT, @@ -82,7 +84,7 @@ ComputationGraph for (auto const &input_feature : config.input_features) { std::string const &feature_name = input_feature.second; - size_t shape = config.feature_shapes.at(feature_name); + nonnegative_int shape = config.feature_shapes.at(feature_name); tensor_guid_t input = create_input_tensor({config.batch_size, shape}); all_inputs.push_back(input); @@ -94,8 +96,9 @@ ComputationGraph } } - tensor_guid_t output = cgb.concat(encoded_inputs, /*axis=*/1); - for (int const &dense_layer_dim : config.dense_layers) { + tensor_guid_t output = + cgb.concat(encoded_inputs, /*axis=*/relative_ff_dim_t{1}); + for (nonnegative_int dense_layer_dim : config.dense_layers) { tensor_guid_t residual_input = output; output = cgb.dense(output, dense_layer_dim, @@ -111,7 +114,7 @@ ComputationGraph } } output = cgb.dense(output, - /*outDim=*/1, + /*outDim=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, diff --git a/lib/models/src/models/inception_v3/inception_v3.cc b/lib/models/src/models/inception_v3/inception_v3.cc index f540eae629..3a829f3754 100644 --- a/lib/models/src/models/inception_v3/inception_v3.cc +++ b/lib/models/src/models/inception_v3/inception_v3.cc @@ -15,14 +15,17 @@ struct CheckShape { ComputationGraphBuilder const &cgb; InceptionV3Config const &config; - void operator()(tensor_guid_t t, int c, int h, int w) const { + void operator()(tensor_guid_t t, + nonnegative_int c, + nonnegative_int h, + nonnegative_int w) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - size_t_from_int(c), - size_t_from_int(h), - size_t_from_int(w), + TensorDims{FFOrdered{ + config.batch_size, + c, + h, + w, }}, DataType::FLOAT, }; @@ -35,12 +38,12 @@ struct CheckShape { } } - void operator()(tensor_guid_t t, int c) const { + void operator()(tensor_guid_t t, nonnegative_int c) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - size_t_from_int(c), + TensorDims{FFOrdered{ + config.batch_size, + c, }}, DataType::FLOAT, }; @@ -56,11 +59,11 @@ struct CheckShape { InceptionV3Config get_default_inception_v3_training_config() { 
return InceptionV3Config{ - /*num_classes=*/1000, + /*num_classes=*/1000_n, // see section 8 of https://arxiv.org/abs/1512.00567 for the source of the // batch size - /*batch_size=*/32, + /*batch_size=*/32_n, // see section 4 of https://arxiv.org/abs/1512.00567 for a discussion of // auxiliary logits. they are used by default in training @@ -70,13 +73,13 @@ InceptionV3Config get_default_inception_v3_training_config() { static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - int filters, - int kernel_size_h, - int kernel_size_w, - int stride_h = 1, - int stride_w = 1, - int padding_h = 0, - int padding_w = 0, + nonnegative_int filters, + nonnegative_int kernel_size_h, + nonnegative_int kernel_size_w, + nonnegative_int stride_h = 1_n, + nonnegative_int stride_w = 1_n, + nonnegative_int padding_h = 0_n, + nonnegative_int padding_w = 0_n, bool use_bias = false) { tensor_guid_t conv = cgb.conv2d(input, /*outChannels=*/filters, @@ -87,7 +90,7 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, /*paddingH=*/padding_h, /*paddingW=*/padding_w, /*activation=*/std::nullopt, - /*groups=*/1, + /*groups=*/1_n, /*use_bias=*/use_bias); return cgb.batch_norm(conv, /*affine=*/true, @@ -98,29 +101,29 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - int pool_features) { + nonnegative_int pool_features) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t branch5x5 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/48, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/48_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/5, - /*kernel_size_w=*/5, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/2, - /*padding_w=*/2); + /*filters=*/64_n, + /*kernel_size_h=*/5_n, + /*kernel_size_w=*/5_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/2_n, + /*padding_w=*/2_n); return t; }(); @@ -128,208 +131,209 @@ static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); return t; }(); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, 
/*filters=*/pool_features, - /*kernel_stride_h=*/1, - /*kernel_stride_w=*/1); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); return t; }(); return cgb.concat({branch1x1, branch5x5, branch3x3dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_b(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch3x3 = create_conv_block(cgb, input, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); tensor_guid_t branch3x3dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_stride_h=*/3, - /*kernel_stride_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - return cgb.concat({branch3x3, branch3x3dbl, branch_pool}, /*axis=*/1); + return cgb.concat({branch3x3, branch3x3dbl, branch_pool}, + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_c(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - int channels_7x7) { + nonnegative_int channels_7x7) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(branch1x1, 192, 17, 17); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(branch1x1, 192_n, 17_n, 17_n); tensor_guid_t branch7x7 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*filters=*/192_n, + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); return t; }(); - check_shape(branch7x7, 192, 17, 17); + check_shape(branch7x7, 192_n, 17_n, 17_n); tensor_guid_t branch7x7dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t =
create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); return t; }(); - check_shape(branch7x7dbl, 192, 17, 17); + check_shape(branch7x7dbl, 192_n, 17_n, 17_n); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); return t; }(); - check_shape(branch_pool, 192, 17, 17); + check_shape(branch_pool, 192_n, 17_n, 17_n); return cgb.concat({branch1x1, branch7x7, branch7x7dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, @@ -338,10 +342,10 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - t = create_conv_block(cgb, t, 320, 3, 3, 2, 2); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + t = create_conv_block(cgb, t, 320_n, 3_n, 3_n, 2_n, 2_n); return t; }(); @@ -349,83 +353,84 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*filters=*/192_n, + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192, - 
/*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/192_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - return cgb.concat({branch3x3, branch7x7x3, branch_pool}, /*axis=*/1); + return cgb.concat({branch3x3, branch7x7x3, branch_pool}, + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/320, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/320_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t branch3x3 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/0); - t = cgb.concat({t_1, t_2}, /*axis=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/0_n); + t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); return t; }(); @@ -433,60 +438,60 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/448, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/448_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/0); - t = cgb.concat({t_1, t_2}, /*axis=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/0_n); + t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); return t; }(); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - 
/*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); return t; }(); return cgb.concat({branch1x1, branch3x3, branch3x3dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, @@ -494,75 +499,75 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t t = input; - check_shape(t, 3, 299, 299); + check_shape(t, 3_n, 299_n, 299_n); // Conv2d_1a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); - check_shape(t, 32, 149, 149); + /*filters=*/32_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); + check_shape(t, 32_n, 149_n, 149_n); // Conv2d_2a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3); - check_shape(t, 32, 147, 147); + /*filters=*/32_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n); + check_shape(t, 32_n, 147_n, 147_n); // Conv2d_2b_3x3 t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); - check_shape(t, 64, 147, 147); + /*filters=*/64_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); + check_shape(t, 64_n, 147_n, 147_n); // maxpool1 t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 64, 73, 73); + check_shape(t, 64_n, 73_n, 73_n); // Conv2d_3b_1x1 t = create_conv_block(cgb, t, - /*filters=*/80, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(t, 80, 73, 73); + /*filters=*/80_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(t, 80_n, 73_n, 73_n); // Conv2d_4a_3x3 t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3); - check_shape(t, 192, 71, 71); + /*filters=*/192_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n); + check_shape(t, 192_n, 71_n, 71_n); // maxpool2 t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 192, 35, 35); + check_shape(t, 192_n, 35_n, 35_n); return t; } @@ -570,26 +575,26 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - size_t num_classes) { + nonnegative_int num_classes) { // avgpool tensor_guid_t x = cgb.pool2d(input, - /*kernelH=*/8, - /*kernelW=*/8, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/8_n, + /*kernelW=*/8_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 2048, 1, 1); 
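The check_shape calls pin down the expected (channels, height, width) after every stage, and the spatial sizes follow the standard convolution/pooling arithmetic, out = (in + 2*padding - kernel) / stride + 1. A self-contained check (hypothetical helper, not part of the patch) reproducing the sizes asserted in create_initial_layers above:

    #include <cassert>

    // out = floor((in + 2*padding - kernel) / stride) + 1
    constexpr int conv2d_out_size(int in, int kernel, int stride, int padding) {
      return (in + 2 * padding - kernel) / stride + 1;
    }

    int main() {
      assert(conv2d_out_size(299, 3, 2, 0) == 149); // Conv2d_1a_3x3
      assert(conv2d_out_size(149, 3, 1, 0) == 147); // Conv2d_2a_3x3
      assert(conv2d_out_size(147, 3, 1, 1) == 147); // Conv2d_2b_3x3 (padded)
      assert(conv2d_out_size(147, 3, 2, 0) == 73);  // maxpool1
      assert(conv2d_out_size(73, 3, 1, 0) == 71);   // Conv2d_4a_3x3
      assert(conv2d_out_size(71, 3, 2, 0) == 35);   // maxpool2
    }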
+ check_shape(x, 2048_n, 1_n, 1_n); // dropout x = cgb.dropout(x, /*rate=*/0.5); - check_shape(x, 2048, 1, 1); + check_shape(x, 2048_n, 1_n, 1_n); x = cgb.flat(x, - /*start_dim=*/1); - check_shape(x, 2048); + /*start_dim=*/relative_ff_dim_t{1}); + check_shape(x, 2048_n); // fc x = cgb.dense(x, @@ -597,7 +602,7 @@ static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, check_shape(x, num_classes); // softmax (not in pytorch model, but shown in Table 1 on p6 of // https://arxiv.org/abs/1512.00567) x = cgb.softmax(x); check_shape(x, num_classes); @@ -607,44 +612,44 @@ static tensor_guid_t create_inception_aux(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - size_t num_classes) { + nonnegative_int num_classes) { tensor_guid_t x = input; - check_shape(x, 768, 17, 17); + check_shape(x, 768_n, 17_n, 17_n); x = cgb.pool2d(x, - /*kernelH=*/5, - /*kernelW=*/5, - /*strideH=*/3, - /*strideW=*/3, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/5_n, + /*kernelW=*/5_n, + /*strideH=*/3_n, + /*strideW=*/3_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 768, 5, 5); + check_shape(x, 768_n, 5_n, 5_n); // conv0 x = create_conv_block(cgb, x, - /*filters=*/128, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(x, 128, 5, 5); + /*filters=*/128_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(x, 128_n, 5_n, 5_n); // conv1 x = create_conv_block(cgb, x, - /*filters=*/768, - /*kernel_size_h=*/5, - /*kernel_size_w=*/5); - check_shape(x, 768, 1, 1); + /*filters=*/768_n, + /*kernel_size_h=*/5_n, + /*kernel_size_w=*/5_n); + check_shape(x, 768_n, 1_n, 1_n); x = cgb.adaptive_pool2d(x, - /*output_h=*/1, - /*output_w=*/1); - check_shape(x, 768, 1, 1); + /*output_h=*/1_n, + /*output_w=*/1_n); + check_shape(x, 768_n, 1_n, 1_n); x = cgb.flat(x, - /*start_dim=*/1); - check_shape(x, 768); + /*start_dim=*/relative_ff_dim_t{1}); + check_shape(x, 768_n); // fc x = cgb.dense(x, @@ -666,39 +671,39 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, }; tensor_guid_t x = create_initial_layers(cgb, check_shape, input); - check_shape(x, 192, 35, 35); + check_shape(x, 192_n, 35_n, 35_n); // Mixed_5b - x = create_inception_module_a(cgb, x, 32); - check_shape(x, 256, 35, 35); + x = create_inception_module_a(cgb, x, 32_n); + check_shape(x, 256_n, 35_n, 35_n); // Mixed_5c - x = create_inception_module_a(cgb, x, 64); - check_shape(x, 288, 35, 35); + x = create_inception_module_a(cgb, x, 64_n); + check_shape(x, 288_n, 35_n, 35_n); // Mixed_5d - x = create_inception_module_a(cgb, x, 64); - check_shape(x, 288, 35, 35); + x = create_inception_module_a(cgb, x, 64_n); + check_shape(x, 288_n, 35_n, 35_n); // Mixed_6a x = create_inception_module_b(cgb, x); - check_shape(x, 768, 17, 17); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6b - x = create_inception_module_c(cgb, check_shape, x, 128); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 128_n); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6c - x = create_inception_module_c(cgb, check_shape, x, 160); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 160_n); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6d - x = create_inception_module_c(cgb, check_shape, x, 160); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 160_n); + check_shape(x,
768_n, 17_n, 17_n); // Mixed_6e - x = create_inception_module_c(cgb, check_shape, x, 192); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 192_n); + check_shape(x, 768_n, 17_n, 17_n); std::optional aux; if (config.aux_logits) { @@ -708,15 +713,15 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, // Mixed_7a x = create_inception_module_d(cgb, x); - check_shape(x, 1280, 8, 8); + check_shape(x, 1280_n, 8_n, 8_n); // Mixed_7b x = create_inception_module_e(cgb, x); - check_shape(x, 2048, 8, 8); + check_shape(x, 2048_n, 8_n, 8_n); // Mixed_7c x = create_inception_module_e(cgb, x); - check_shape(x, 2048, 8, 8); + check_shape(x, 2048_n, 8_n, 8_n); x = create_final_layers(cgb, check_shape, x, config.num_classes); check_shape(x, config.num_classes); @@ -732,11 +737,11 @@ ComputationGraph ComputationGraphBuilder cgb; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - 3, - 299, - 299, + TensorDims{FFOrdered{ + config.batch_size, + 3_n, + 299_n, + 299_n, }}, DataType::FLOAT, }; diff --git a/lib/models/src/models/split_test/split_test.cc b/lib/models/src/models/split_test/split_test.cc index 118f94ec06..d3876d8bfc 100644 --- a/lib/models/src/models/split_test/split_test.cc +++ b/lib/models/src/models/split_test/split_test.cc @@ -4,18 +4,18 @@ namespace FlexFlow { -ComputationGraph get_split_test_computation_graph(int batch_size) { +ComputationGraph get_split_test_computation_graph(nonnegative_int batch_size) { ComputationGraphBuilder cgb; - int layer_dim1 = 256; - int layer_dim2 = 128; - int layer_dim3 = 64; - int layer_dim4 = 32; + nonnegative_int layer_dim1 = 256_n; + nonnegative_int layer_dim2 = 128_n; + nonnegative_int layer_dim3 = 64_n; + nonnegative_int layer_dim4 = 32_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(batch_size), - size_t_from_int(layer_dim1), + TensorDims{FFOrdered{ + batch_size, + layer_dim1, }}, DataType::FLOAT, }; diff --git a/lib/models/src/models/transformer/transformer.cc b/lib/models/src/models/transformer/transformer.cc index 173a1b291c..f71763313a 100644 --- a/lib/models/src/models/transformer/transformer.cc +++ b/lib/models/src/models/transformer/transformer.cc @@ -4,16 +4,16 @@ namespace FlexFlow { TransformerConfig get_default_transformer_config() { - return TransformerConfig{/*num_features=*/512, - /*sequence_length=*/512, - /*batch_size=*/64, - /*dim_feedforward=*/2048, - /*num_heads=*/8, - /*num_encoder_layers=*/6, - /*num_decoder_layers=*/6, + return TransformerConfig{/*num_features=*/512_n, + /*sequence_length=*/512_n, + /*batch_size=*/64_n, + /*dim_feedforward=*/2048_n, + /*num_heads=*/8_n, + /*num_encoder_layers=*/6_n, + /*num_decoder_layers=*/6_n, /*dropout=*/0.1, /*layer_norm_eps=*/1e-05, - /*vocab_size=*/64}; + /*vocab_size=*/64_n}; } tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb, @@ -32,18 +32,20 @@ tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb, tensor_guid_t create_transformer_encoder_layer(ComputationGraphBuilder &cgb, TransformerConfig const &config, tensor_guid_t const &input) { - std::vector layer_norm_axis{2}; // Normalize the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; - tensor_guid_t self_attention = cgb.multihead_attention(input, - input, - input, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + std::vector layer_norm_axis = { + 
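Note the axis change at this point: the hard-coded layer-norm axis 2 becomes relative_ff_dim_t{-1}, meaning "last dimension" regardless of rank. Negative relative dimensions are resolved against the tensor's rank, as in ff_dim_t_from_relative_ff_dim_t (whose signature, further down in this patch, now takes a nonnegative_int input_dim). A sketch of the resolution rule, with the arithmetic assumed rather than quoted from the implementation:

    // Hypothetical mirror of ff_dim_t_from_relative_ff_dim_t's behavior.
    int resolve_relative_dim(int relative_dim, int num_dims) {
      return relative_dim < 0 ? relative_dim + num_dims : relative_dim;
    }
    // For the rank-3 (batch, sequence, feature) activations used here,
    // resolve_relative_dim(-1, 3) == 2, so the old axis 2 and the new
    // relative_ff_dim_t{-1} pick out the same dimension.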
relative_ff_dim_t{-1}}; // Normalize the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; + tensor_guid_t self_attention = + cgb.multihead_attention(/*query=*/input, + /*key=*/input, + /*value=*/input, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention)); @@ -79,18 +81,20 @@ tensor_guid_t TransformerConfig const &config, tensor_guid_t const &input, tensor_guid_t const &encoder_output) { - std::vector layer_norm_axis{2}; // Normalize the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; - tensor_guid_t self_attention = cgb.multihead_attention(input, - input, - input, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + std::vector layer_norm_axis = { + relative_ff_dim_t{-1}}; // Normalize the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; + tensor_guid_t self_attention = + cgb.multihead_attention(/*query=*/input, + /*key=*/input, + /*value=*/input, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention)); @@ -102,15 +106,16 @@ tensor_guid_t assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention_normalized)); - tensor_guid_t mha = cgb.multihead_attention(self_attention_normalized, - encoder_output, - encoder_output, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + tensor_guid_t mha = + cgb.multihead_attention(/*query=*/self_attention_normalized, + /*key=*/encoder_output, + /*value=*/encoder_output, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent(cgb.computation_graph, input, mha)); tensor_guid_t mha_normalized = @@ -148,7 +153,7 @@ ComputationGraph ComputationGraphBuilder cgb; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.num_features}}, DataType::FLOAT, }; diff --git a/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml b/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml index 014526a601..f1c5fe6b23 100644 --- a/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml +++ b/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml @@ -11,7 +11,7 @@ features = [ includes = [ "op-attrs/ops/attention_attrs.dtg.h", - "op-attrs/ops/batch_matmul.dtg.h", + "op-attrs/ops/batch_matmul_attrs.dtg.h", "op-attrs/ops/batch_norm_attrs.dtg.h", "op-attrs/ops/broadcast_attrs.dtg.h", "op-attrs/ops/cast_attrs.dtg.h", diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h index 5af00fb510..3a817af38c 100644 --- a/lib/op-attrs/include/op-attrs/datatype.h +++ b/lib/op-attrs/include/op-attrs/datatype.h @@ -4,6 +4,7 @@ #include "op-attrs/datatype.dtg.h" #include "utils/fmt.h" #include "utils/fp16.h" +#include 
"utils/nonnegative_int/nonnegative_int.h" #include namespace FlexFlow { @@ -49,7 +50,7 @@ typename data_type_enum_to_class
::type cast_to(T t) { template using real_type_t = typename data_type_enum_to_class
::type; -size_t size_of_datatype(DataType); +nonnegative_int size_of_datatype(DataType); bool can_strictly_promote_datatype_from_to(DataType, DataType); diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index 3977f4e0fd..f2355289dc 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -32,19 +32,13 @@ struct DimOrdered { : contents(contents.begin(), contents.end()) {} T const &at(Idx idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); + nonnegative_int raw = idx.value; + return this->contents.at(raw.unwrap_nonnegative()); } T &at(Idx idx) { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); + nonnegative_int raw = idx.value; + return this->contents.at(raw.unwrap_nonnegative()); } T const &operator[](Idx idx) const { @@ -56,11 +50,8 @@ struct DimOrdered { } bool idx_is_valid(Idx const &idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return (raw >= 0 && raw < this->contents.size()); + nonnegative_int raw = idx.value; + return (raw < this->contents.size()); } bool operator==(DimOrdered const &other) const { @@ -172,7 +163,7 @@ struct DimOrdered { : contents(contents.begin(), contents.end()) {} T const &at(ff_dim_t idx) const { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return this->contents.at(raw); } @@ -185,7 +176,7 @@ struct DimOrdered { } T &at(ff_dim_t idx) { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return this->contents.at(raw); } @@ -214,7 +205,7 @@ struct DimOrdered { } bool idx_is_valid(ff_dim_t const &idx) const { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return raw < this->contents.size(); } diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h index c9e6db4d17..166916dd44 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h @@ -27,8 +27,8 @@ FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, std::optional const &end) { auto to_raw_idx = [](std::optional const &idx) -> std::optional { - return transform(idx, - [](ff_dim_t const &i) { return i.value.get_value(); }); + return transform( + idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); }; return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; diff --git a/lib/op-attrs/include/op-attrs/get_op_type.h b/lib/op-attrs/include/op-attrs/get_op_type.h index b60880a98b..7799900709 100644 --- a/lib/op-attrs/include/op-attrs/get_op_type.h +++ b/lib/op-attrs/include/op-attrs/get_op_type.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_OP_ATTRS_GET_OP_TYPE_H #include "op-attrs/ops/attention_attrs.dtg.h" -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" #include "op-attrs/ops/broadcast_attrs.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h index e06d795c04..5f1b11c1bb 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention.h +++ b/lib/op-attrs/include/op-attrs/ops/attention.h @@ -12,31 +12,31 @@ namespace FlexFlow { -int 
get_qProjSize(MultiHeadAttentionAttrs const &); -int get_vProjSize(MultiHeadAttentionAttrs const &); -int get_kProjSize(MultiHeadAttentionAttrs const &); -int get_oProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_vProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &); -int get_qSize(MultiHeadAttentionParallelInputs const &); -int get_qSize(MultiHeadAttentionInputs const &); +nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_qSize(MultiHeadAttentionInputs const &); -int get_kSize(MultiHeadAttentionParallelInputs const &); -int get_kSize(MultiHeadAttentionInputs const &); +nonnegative_int get_kSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_kSize(MultiHeadAttentionInputs const &); -int get_vSize(MultiHeadAttentionParallelInputs const &); -int get_vSize(MultiHeadAttentionInputs const &); +nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_vSize(MultiHeadAttentionInputs const &); -int get_oSize(ParallelTensorShape const &); -int get_oSize(TensorShape const &); +nonnegative_int get_oSize(ParallelTensorShape const &); +nonnegative_int get_oSize(TensorShape const &); -int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); -int get_qoSeqLength(MultiHeadAttentionInputs const &); +nonnegative_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &); -int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); -int get_kvSeqLength(MultiHeadAttentionInputs const &); +nonnegative_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &); -int get_num_samples(MultiHeadAttentionParallelInputs const &); -int get_num_samples(MultiHeadAttentionInputs const &); +nonnegative_int get_num_samples(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_num_samples(MultiHeadAttentionInputs const &); std::vector get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs const &); diff --git a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml index b82b285451..f85b7268af 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml @@ -10,29 +10,29 @@ features = [ ] includes = [ - "", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "query_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "key_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "value_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml index d96d8af69c..019131b07c 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml @@ -10,21 
+10,25 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "embed_dim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "kdim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "vdim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dropout" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h index 574b4ef579..333da4fa29 100644 --- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h +++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_BATCH_MATMUL_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_BATCH_MATMUL_H -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "op-attrs/ops/core.h" #include "op-attrs/parallel_tensor_shape.dtg.h" #include "op-attrs/tensor_shape.dtg.h" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml b/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml deleted file mode 100644 index 3b1dd3f687..0000000000 --- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml +++ /dev/null @@ -1,19 +0,0 @@ -namespace = "FlexFlow" -name = "BatchMatmulAttrs" - -features = [ - "eq", - "ord", - "hash", - "json", - "rapidcheck", - "fmt", -] - -[[fields]] -name = "a_seq_length_dim" -type = "int" - -[[fields]] -name = "b_seq_length_dim" -type = "int" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml new file mode 100644 index 0000000000..394dfb5fcc --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "BatchMatmulAttrs" + +features = [ + "eq", + "ord", + "hash", + "json", + "rapidcheck", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", + "", +] + +src_includes = [ + "utils/fmt/optional.h", + "utils/json/optional.h", + "utils/rapidcheck/optional.h", +] + +[[fields]] +name = "a_seq_length_dim" +type = "std::optional<::FlexFlow::nonnegative_int>" + +[[fields]] +name = "b_seq_length_dim" +type = "std::optional<::FlexFlow::nonnegative_int>" diff --git a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml index e7eeedec06..b3c574264c 100644 --- a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -20,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "combine_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml index 77e8c51244..c4fb74ebd8 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml @@ -12,23 +12,24 @@ features = [ includes = [ "", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "num_samples" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] 
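These .struct.toml files are dtgen specs: each [[fields]] entry (the num_channels, height, and width entries continue right below) becomes a member of a generated struct, so retyping a field from size_t to ::FlexFlow::nonnegative_int propagates into every generated constructor and accessor. Roughly, and only as an assumed illustration of the generated shape rather than actual dtgen output:

    // Assumed shape of the code dtgen emits for conv_2d_input_shape.struct.toml.
    #include "op-attrs/datatype.dtg.h"
    #include "utils/nonnegative_int/nonnegative_int.h"

    namespace FlexFlow {
    struct Conv2DInputShape {
      nonnegative_int num_samples;
      nonnegative_int num_channels;
      nonnegative_int height;
      nonnegative_int width;
      DataType datatype;
      // The spec's `features` list (eq, ord, hash, json, rapidcheck, fmt)
      // additionally requests comparison operators, hashing, JSON
      // (de)serialization, rapidcheck generators, and fmt support.
    };
    } // namespace FlexFlow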
name = "num_channels" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "height" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "width" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml index 68cbd878d1..fdf0eaca78 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -32,11 +33,11 @@ type = "::FlexFlow::ShardParallelDim" [[fields]] name = "sum_reduction_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "discard_copy_reduction_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml index 5bef144cd9..8b86d42e04 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "", "op-attrs/activation.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -21,14 +22,14 @@ src_includes = [ ] fields = [ - { name = "out_channels", type = "int" }, - { name = "kernel_h", type = "int" }, - { name = "kernel_w", type = "int" }, - { name = "stride_h", type = "int" }, - { name = "stride_w", type = "int" }, - { name = "padding_h", type = "int" }, - { name = "padding_w", type = "int" }, - { name = "groups", type = "int" }, + { name = "out_channels", type = "::FlexFlow::nonnegative_int" }, + { name = "kernel_h", type = "::FlexFlow::nonnegative_int" }, + { name = "kernel_w", type = "::FlexFlow::nonnegative_int" }, + { name = "stride_h", type = "::FlexFlow::nonnegative_int" }, + { name = "stride_w", type = "::FlexFlow::nonnegative_int" }, + { name = "padding_h", type = "::FlexFlow::nonnegative_int" }, + { name = "padding_w", type = "::FlexFlow::nonnegative_int" }, + { name = "groups", type = "::FlexFlow::nonnegative_int" }, { name = "activation", type = "std::optional<::FlexFlow::Activation>" }, { name = "use_bias", type = "bool" }, ] diff --git a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml index b8d15284e9..5a857efb3e 100644 --- a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml @@ -10,9 +10,10 @@ features = [ ] includes = [ - "utils/stack_vector/stack_vector.h", "op-attrs/aggregate_op.dtg.h", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", + "", ] src_includes = [ @@ -23,11 +24,11 @@ src_includes = [ [[fields]] name = "num_entries" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "out_channels" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "aggr" diff --git a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml index 0a35a6c5ec..ffbe93c975 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml +++ 
b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml @@ -14,6 +14,7 @@ includes = [ "op-attrs/activation.dtg.h", "op-attrs/regularizer_attrs.dtg.h", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -24,7 +25,7 @@ src_includes = [ [[fields]] name = "out_channels" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "use_bias" diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d.h b/lib/op-attrs/include/op-attrs/ops/pool_2d.h index 1af22ad022..af11d61f07 100644 --- a/lib/op-attrs/include/op-attrs/ops/pool_2d.h +++ b/lib/op-attrs/include/op-attrs/ops/pool_2d.h @@ -13,8 +13,8 @@ CHECK_VALID_OP_ATTR(Pool2DAttrs); tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp pool_type, std::optional const &activation); diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml index 20ca7deabc..fea318d46d 100644 --- a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml @@ -13,6 +13,7 @@ includes = [ "op-attrs/pool_op.dtg.h", "op-attrs/activation.dtg.h", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -23,27 +24,27 @@ src_includes = [ [[fields]] name = "kernel_h" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "kernel_w" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "stride_h" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "stride_w" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "padding_h" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "padding_w" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "pool_type" diff --git a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml index ee0ae54132..2798a85caf 100644 --- a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "reduction_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml index 69c4b7580f..965c40c05a 100644 --- a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -20,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "repartition_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml index 4e43ea747a..58e365c0f2 100644 --- a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml @@ -9,8 +9,10 @@ features = [ "fmt", ] -includes = [ ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] [[fields]] name = "replicate_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git 
a/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml index fce827f5c2..7ce1ad7e34 100644 --- a/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml @@ -13,11 +13,12 @@ includes = [ "utils/stack_vector/stack_vector.h", "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "splits" -type = "::FlexFlow::stack_vector" +type = "::FlexFlow::stack_vector<::FlexFlow::nonnegative_int, MAX_NUM_OUTPUTS>" [[fields]] name = "axis" diff --git a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml index 9ecbf1d725..1c5bfc8e10 100644 --- a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml @@ -9,9 +9,13 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "k" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "sorted" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml index 974b27d2a7..be3a95eec8 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml @@ -13,6 +13,7 @@ includes = [ "op-attrs/parallel_tensor_shape/sum_degree.dtg.h", "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h", "op-attrs/dim_ordered/dim_ordered.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -25,4 +26,4 @@ type = "::FlexFlow::DiscardCopyDegree" [[fields]] name = "shard_degrees" -type = "::FlexFlow::FFOrdered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index 6b88a7bda1..67864e637b 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -9,27 +9,27 @@ namespace FlexFlow { FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &); -FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &); +FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &); std::unordered_set replica_dims(ParallelTensorDims const &); /* size_t get_volume(ParallelTensorDims const &); */ -size_t num_shard_dims(ParallelTensorDims const &); +nonnegative_int num_shard_dims(ParallelTensorDims const &); ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &); ParallelTensorDims lift_to_parallel(TensorDims const &); -ParallelTensorDims - lift_to_parallel_with_degrees(TensorDims const &, - SumDegree const &, - DiscardCopyDegree const &, - FFOrdered const &shard_degrees); +ParallelTensorDims lift_to_parallel_with_degrees( + TensorDims const &, + SumDegree const &, + DiscardCopyDegree const &, + FFOrdered const &shard_degrees); ParallelTensorDims lift_to_parallel_with_degrees(TensorDims const &, ParallelTensorDimDegrees const &); -int total_replica_degree(ParallelTensorDims const &); -int total_shard_degree(ParallelTensorDims const &); -int total_parallel_degree(ParallelTensorDims const &); +nonnegative_int total_replica_degree(ParallelTensorDims const &); +nonnegative_int total_shard_degree(ParallelTensorDims const &); +nonnegative_int total_parallel_degree(ParallelTensorDims const &); ShardParallelDim 
shard_dim_at_idx(ParallelTensorDims const &, relative_ff_dim_t); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index 0339b9b8a6..d461ffc9e4 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -12,12 +12,13 @@ namespace FlexFlow { -int num_shard_dims(ParallelTensorShape const &); +nonnegative_int num_shard_dims(ParallelTensorShape const &); ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, relative_ff_dim_t); -FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &); +FFOrdered + ff_ordered_shard_degrees(ParallelTensorShape const &); std::optional try_get_shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); @@ -25,11 +26,11 @@ std::optional ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorShape const &); ParallelTensorShape lift_to_parallel(TensorShape const &); -ParallelTensorShape - lift_to_parallel_with_degrees(TensorShape const &, - SumDegree const &, - DiscardCopyDegree const &, - FFOrdered const &shard_degrees); +ParallelTensorShape lift_to_parallel_with_degrees( + TensorShape const &, + SumDegree const &, + DiscardCopyDegree const &, + FFOrdered const &shard_degrees); ParallelTensorShape lift_to_parallel_with_degrees(TensorShape const &, ParallelTensorDimDegrees const &); @@ -37,13 +38,13 @@ ParallelTensorShape std::unordered_set replica_dims(ParallelTensorShape const &); TensorShape get_piece_shape(ParallelTensorShape const &); -int get_num_replica_dims(ParallelTensorShape const &); -int get_num_replicas(ParallelTensorShape const &); +nonnegative_int get_num_replica_dims(ParallelTensorShape const &); +nonnegative_int get_num_replicas(ParallelTensorShape const &); -int get_sum_degree(ParallelTensorShape const &); -int get_discard_copy_degree(ParallelTensorShape const &); +nonnegative_int get_sum_degree(ParallelTensorShape const &); +nonnegative_int get_discard_copy_degree(ParallelTensorShape const &); -int get_total_parallel_degree(ParallelTensorShape const &); +nonnegative_int get_total_parallel_degree(ParallelTensorShape const &); bool is_valid(ParallelTensorShape const &); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml index b4905fb0ce..76b52bcdef 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "value" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml index d86917211e..550a384ba9 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "value" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml 
b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml index a44d712dbf..fdd11ac11f 100644 --- a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml +++ b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml @@ -11,7 +11,7 @@ features = [ includes = [ "op-attrs/ops/attention_attrs.dtg.h", - "op-attrs/ops/batch_matmul.dtg.h", + "op-attrs/ops/batch_matmul_attrs.dtg.h", "op-attrs/ops/batch_norm_attrs.dtg.h", "op-attrs/ops/broadcast_attrs.dtg.h", "op-attrs/ops/cast_attrs.dtg.h", diff --git a/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h index af51cc69be..5205b1ead8 100644 --- a/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h +++ b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h @@ -7,7 +7,7 @@ namespace FlexFlow { ff_dim_t ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t ff_dim, - int input_dim); + nonnegative_int input_dim); } // namespace FlexFlow namespace rc { diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml index 2ad442aa22..5ca486181e 100644 --- a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml @@ -11,11 +11,12 @@ features = [ includes = [ "op-attrs/replica_type.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "degree" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "replica_type" diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h index 74a8df339b..92d2b0abb2 100644 --- a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h @@ -8,7 +8,8 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set(); -int get_degree_of_replica_type(ReplicaParallelDimSet const &, ReplicaType); +nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &, + ReplicaType); std::unordered_set get_replica_dims(ReplicaParallelDimSet const &); bool is_valid(ReplicaParallelDimSet const &); diff --git a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml index 21c81396d1..5c5d2dc5b2 100644 --- a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml @@ -9,10 +9,14 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index 5e1503360b..bf11f36e51 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -6,11 +6,11 @@ namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &); +FFOrdered const &ff_ordered(TensorDims const &); -size_t num_dims(TensorDims const &); -size_t dim_at_idx(TensorDims const &, relative_ff_dim_t); -size_t &dim_at_idx(TensorDims &, relative_ff_dim_t); +nonnegative_int num_dims(TensorDims const &); +nonnegative_int dim_at_idx(TensorDims const &, relative_ff_dim_t); +nonnegative_int &dim_at_idx(TensorDims &, relative_ff_dim_t); bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal); diff --git 
a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index b262dd32b6..e86b866fd6 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -8,10 +8,12 @@ features = [ "rapidcheck", "fmt", ] + includes = [ "op-attrs/dim_ordered/dim_ordered.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "ff_ordered" -type = "::FlexFlow::FFOrdered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index b8733cddbe..15958a1daf 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -5,11 +5,11 @@ namespace FlexFlow { -size_t num_dims(TensorShape const &); -size_t dim_at_idx(TensorShape const &, relative_ff_dim_t); -size_t &dim_at_idx(TensorShape &, relative_ff_dim_t); -size_t get_num_elements(TensorShape const &); -size_t get_size_in_bytes(TensorShape const &); +nonnegative_int num_dims(TensorShape const &); +nonnegative_int dim_at_idx(TensorShape const &, relative_ff_dim_t); +nonnegative_int &dim_at_idx(TensorShape &, relative_ff_dim_t); +nonnegative_int get_num_elements(TensorShape const &); +nonnegative_int get_size_in_bytes(TensorShape const &); } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/datatype.cc b/lib/op-attrs/src/op-attrs/datatype.cc index 3bee05c253..9bb3b34390 100644 --- a/lib/op-attrs/src/op-attrs/datatype.cc +++ b/lib/op-attrs/src/op-attrs/datatype.cc @@ -1,23 +1,24 @@ #include "op-attrs/datatype.h" #include "utils/containers/contains.h" #include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { -size_t size_of_datatype(DataType data_type) { +nonnegative_int size_of_datatype(DataType data_type) { switch (data_type) { case DataType::BOOL: - return sizeof(bool); + return nonnegative_int{sizeof(bool)}; case DataType::INT32: - return sizeof(int32_t); + return nonnegative_int{sizeof(int32_t)}; case DataType::INT64: - return sizeof(int64_t); + return nonnegative_int{sizeof(int64_t)}; case DataType::HALF: - return sizeof(float) / 2; + return nonnegative_int{sizeof(float)} / 2_n; case DataType::FLOAT: - return sizeof(float); + return nonnegative_int{sizeof(float)}; case DataType::DOUBLE: - return sizeof(double); + return nonnegative_int{sizeof(double)}; default: throw mk_runtime_error(fmt::format("Unknown DataType {}", data_type)); } diff --git a/lib/op-attrs/src/op-attrs/ff_dim_t.cc b/lib/op-attrs/src/op-attrs/ff_dim_t.cc index 0a99e39a91..44672fc391 100644 --- a/lib/op-attrs/src/op-attrs/ff_dim_t.cc +++ b/lib/op-attrs/src/op-attrs/ff_dim_t.cc @@ -2,7 +2,7 @@ namespace FlexFlow { relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim) { - return relative_ff_dim_t{ff_dim.value.get_value()}; + return relative_ff_dim_t{ff_dim.value.unwrap_nonnegative()}; } } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 57c7105534..10fbf412f7 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -16,79 +16,82 @@ namespace FlexFlow { /* return is_valid; */ /* } */ -int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -int get_vProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int 
get_vProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.vdim; } -int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.embed_dim; } -int get_qSize(TensorShape const &query_shape) { +nonnegative_int get_qSize(TensorShape const &query_shape) { return dim_at_idx(query_shape, relative_ff_dim_t{0}); } -int get_kSize(TensorShape const &key_shape) { +nonnegative_int get_kSize(TensorShape const &key_shape) { return dim_at_idx(key_shape, relative_ff_dim_t{0}); } -int get_vSize(TensorShape const &value_shape) { +nonnegative_int get_vSize(TensorShape const &value_shape) { return dim_at_idx(value_shape, relative_ff_dim_t{0}); } -int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.query_dim.size; } -int get_qSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_qSize(MultiHeadAttentionInputs const &inputs) { return inputs.query_size; } -int get_kSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_kSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.key_dim.size; } -int get_kSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_kSize(MultiHeadAttentionInputs const &inputs) { return inputs.key_size; } -int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.value_dim.size; } -int get_vSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_vSize(MultiHeadAttentionInputs const &inputs) { return inputs.value_size; } -int get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; } -int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; } -int get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; // FIXME -- assumes only prefill } -int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; // FIXME -- assumes only prefill } -int get_num_samples(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_num_samples(MultiHeadAttentionParallelInputs const &inputs) { return inputs.batch_dim.size; } -int get_num_samples(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_num_samples(MultiHeadAttentionInputs const &inputs) { return inputs.batch_size; } @@ -124,10 +127,10 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ parsed.batch_size, parsed.sequence_length, - size_t_from_int(attrs.embed_dim), + attrs.embed_dim, }}, parsed.datatype, }; @@ -147,23 +150,23 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); // W^Q_i in "Attention Is All You Need" top of page 5 - size_t qProjectWeightSize = parsed.query_size * attrs.kdim; + nonnegative_int qProjectWeightSize = parsed.query_size *
attrs.kdim; // W^K_i in "Attention Is All You Need" top of page 5 (all i's put together) - size_t kProjectWeightSize = parsed.key_size * attrs.kdim; + nonnegative_int kProjectWeightSize = parsed.key_size * attrs.kdim; // W^V_i in "Attention Is All You Need" top of page 5 (all i's put together) - size_t vProjectWeightSize = parsed.value_size * attrs.vdim; + nonnegative_int vProjectWeightSize = parsed.value_size * attrs.vdim; // W^O in "Attention Is All You Need" top of page 5, with num_heads factored // out - size_t outWeightSize = attrs.vdim * attrs.embed_dim; + nonnegative_int outWeightSize = attrs.vdim * attrs.embed_dim; return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ (qProjectWeightSize + kProjectWeightSize + vProjectWeightSize + outWeightSize), - size_t_from_int(attrs.num_heads), + attrs.num_heads, }}, parsed.datatype, }; @@ -184,8 +187,8 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.kdim + attrs.kdim + attrs.vdim), + TensorDims{FFOrdered{ + attrs.kdim + attrs.kdim + attrs.vdim, }}, parsed.datatype, }; @@ -206,8 +209,8 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.embed_dim), + TensorDims{FFOrdered{ + attrs.embed_dim, }}, parsed.datatype, }; @@ -235,14 +238,14 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - int joined_dim_degree = 1; - int head_dim_degree = parsed.discard_copy_degree.value; + nonnegative_int joined_dim_degree = 1_n; + nonnegative_int head_dim_degree = parsed.discard_copy_degree.value; return lift_to_parallel_with_degrees( unpar_shape, - SumDegree{1}, + SumDegree{1_n}, DiscardCopyDegree{parsed.batch_dim.degree}, - FFOrdered{joined_dim_degree, head_dim_degree}); + FFOrdered{joined_dim_degree, head_dim_degree}); } tl::expected @@ -273,10 +276,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1}; + FFOrdered shard_degrees = FFOrdered{1_n}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -309,10 +312,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1}; + FFOrdered shard_degrees = FFOrdered{1_n}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -339,402 +342,25 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - int sum_degree = parsed.discard_copy_degree.value; - int discard_copy_degree = 1; - int batch_degree = parsed.batch_dim.degree; - int seq_len_degree = 1; - int out_dim_degree = 1; + nonnegative_int sum_degree = parsed.discard_copy_degree.value; + nonnegative_int discard_copy_degree = 1_n; + nonnegative_int batch_degree = parsed.batch_dim.degree; + nonnegative_int seq_len_degree = 1_n; + nonnegative_int out_dim_degree = 1_n; return lift_to_parallel_with_degrees( unpar_shape, SumDegree{sum_degree}, DiscardCopyDegree{discard_copy_degree}, - FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); + FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); } -int get_oSize(ParallelTensorShape const &) { +nonnegative_int 
get_oSize(ParallelTensorShape const &) { NOT_IMPLEMENTED(); } -int get_oSize(TensorShape const &) { +nonnegative_int get_oSize(TensorShape const &) { NOT_IMPLEMENTED(); } } // namespace FlexFlow - -// Tensor FFModel::multihead_attention(const Tensor query, -// const Tensor key, -// const Tensor value, -// int embed_dim, -// int num_heads, -// int kdim, -// int vdim, -// float dropout, -// bool bias, -// bool add_bias_kv, -// bool add_zero_attn, -// Initializer *kernel_initializer, -// char const *name) { -// Layer *li = new Layer(this, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// query, -// key, -// value); -// { -// int numdims = query->num_dims; -// int dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdims; i++) { -// dims[i] = query->dims[i]; -// } -// dims[0] = embed_dim; -// li->outputs[0] = create_tensor_legion_ordering( -// numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); -// } -// { -// // Compute weight size -// int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, -// oProjSize = embed_dim; -// int qSize = query->dims[0], kSize = key->dims[0], vSize = value->dims[0]; -// int qParas = qProjSize * qSize; -// int kParas = kProjSize * kSize; -// int vParas = vProjSize * vSize; -// int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); -// int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; -// li->weights[0] = create_weight_legion_ordering(2, -// dims, -// DT_FLOAT, -// li, -// true /*create_grad*/, -// kernel_initializer, -// CHOSEN_SYNC_TYPE); -// } -// li->data_type = DT_FLOAT; -// li->add_int_property("embed_dim", embed_dim); -// li->add_int_property("num_heads", num_heads); -// li->add_int_property("kdim", kdim); -// li->add_int_property("vdim", vdim); -// li->add_int_property("bias", bias); -// li->add_int_property("add_bias_kv", add_bias_kv); -// li->add_int_property("add_zero_attn", add_zero_attn); -// li->add_float_property("dropout", dropout); -// layers.push_back(li); -// return li->outputs[0]; -// } - -// MultiHeadAttention::MultiHeadAttention(FFModel &model, -// LayerID const &_layer_guid, -// const ParallelTensor _query, -// const ParallelTensor _key, -// const ParallelTensor _value, -// int _embed_dim, -// int _num_heads, -// int _kdim, -// int _vdim, -// float _dropout, -// bool _bias, -// bool _add_bias_kv, -// bool _add_zero_attn, -// bool allocate_weights, -// char const *name) -// // Initializer* _bias_initializer) -// : Op(model, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// _query, -// _key, -// _value), -// attrs(_embed_dim, -// _num_heads, -// _kdim, -// _vdim, -// _dropout, -// _bias, -// _add_bias_kv, -// _add_zero_attn), -// qSize(_query->dims[0].size), kSize(_key->dims[0].size), -// vSize(_value->dims[0].size), qProjSize(_kdim), -// qoSeqLength(_query->dims[1].size), kvSeqLength(_key->dims[1].size) { -// // overwrite layer_guid -// layer_guid = _layer_guid; - -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// numOutputs = 1; -// int numdim = _query->num_dims; -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i] = _query->dims[i]; -// } -// dims[0].size = _embed_dim; -// // Currently require no parallelism along this dim -// assert(dims[0].degree == 1); -// if (allocate_weights) { -// // Create weight tensor -// int num_dims = inputs[0]->num_dims; -// // Compute weight size -// int qParas = this->qProjSize * 
this->qSize; -// int kParas = kProjSize(attrs) * this->kSize; -// int vParas = vProjSize(attrs) * this->vSize; -// int oParas = oProjSize(attrs) * -// (vProjSize(attrs) > 0 ? vProjSize(attrs) : this->vSize); -// ParallelDim dims[3]; -// dims[0] = inputs[0]->dims[num_dims - 2]; -// dims[0].size = dims[0].degree; -// dims[1] = inputs[0]->dims[num_dims - 1]; -// dims[1].size = this->attrs.num_heads; -// dims[2].size = qParas + kParas + vParas + oParas; -// dims[2].degree = 1; -// dims[2].parallel_idx = -1; -// int seed = std::rand(); -// Initializer *initializer = new GlorotUniform(seed); -// #ifdef USE_NCCL -// ParameterSyncType comm_type = ParameterSyncType::NCCL; -// #else -// ParameterSyncType comm_type = ParameterSyncType::PS; -// #endif -// weights[0] = model.create_parallel_weight<3>(dims, -// DT_FLOAT, -// NULL /*owner_op*/, -// true /*create_grad*/, -// initializer, -// comm_type); -// } - -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// _query->num_dims, dims, DT_FLOAT, this); -// /* for (int i = 0; i < numdim; i++) { */ -// /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ -// /* } */ -// /* // Check correctness */ -// /* assert(check_output_input_weight_parallel_dims()); */ -// } - -// MultiHeadAttention::MultiHeadAttention(FFModel &model, -// const ParallelTensor _query, -// const ParallelTensor _key, -// const ParallelTensor _value, -// const ParallelTensor _weight, -// int _embed_dim, -// int _num_heads, -// int _kdim, -// int _vdim, -// float _dropout, -// bool _bias, -// bool _add_bias_kv, -// bool _add_zero_attn, -// bool allocate_weights, -// char const *name) -// // Initializer* _bias_initializer) -// : Op(model, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// _query, -// _key, -// _value, -// _weight), -// attrs(_embed_dim, -// _num_heads, -// _kdim, -// _vdim, -// _dropout, -// _bias, -// _add_bias_kv, -// _add_zero_attn), -// qSize(_query->dims[0].size), kSize(_key->dims[0].size), -// vSize(_value->dims[0].size), qProjSize(_kdim), -// qoSeqLength(_query->dims[1].size), kvSeqLength(_key->dims[1].size) -// // bias_initializer(_bias_initializer) -// { -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// numOutputs = 1; -// int numdim = _query->num_dims; -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i] = _query->dims[i]; -// } -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// dims[0].size = _embed_dim; -// // Currently require no parallelism along this dim -// assert(dims[0].degree == 1); -// if (allocate_weights) { -// // Create weight tensor -// int num_dims = inputs[0]->num_dims; -// // Compute weight size -// int qParas = this->qProjSize * this->qSize; -// int kParas = kProjSize(attrs) * this->kSize; -// int vParas = vProjSize(attrs) * this->vSize; -// int oParas = oProjSize(attrs) * -// (vProjSize(attrs) > 0 ? 
vProjSize(attrs) : this->vSize); -// ParallelDim dims[3]; -// dims[0] = inputs[0]->dims[num_dims - 2]; -// dims[0].size = dims[0].degree; -// dims[1] = inputs[0]->dims[num_dims - 1]; -// dims[1].size = this->attrs.num_heads; -// dims[2].size = qParas + kParas + vParas + oParas; -// int seed = std::rand(); -// Initializer *initializer = new GlorotUniform(seed); -// #ifdef USE_NCCL -// ParameterSyncType comm_type = ParameterSyncType::NCCL; -// #else -// ParameterSyncType comm_type = ParameterSyncType::PS; -// #endif -// weights[0] = model.create_parallel_weight<3>(dims, -// DT_FLOAT, -// NULL /*owner_op*/, -// true /*create_grad*/, -// initializer, -// comm_type); -// } -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// _query->num_dims, dims, DT_FLOAT, this); - -// /* for (int i = 0; i < numdim; i++) { */ -// /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ -// /* } */ -// /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); -// */ -// /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); -// */ -// // Check correctness -// /* assert(check_output_input_weight_parallel_dims()); */ -// } - -// void MultiHeadAttention::forward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_forward(ff, argmap); -// int idx = 0; -// IndexLauncher launcher(ATTENTION_FWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[1]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[2]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// weights[0]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(4, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// void MultiHeadAttention::backward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_backward(ff, argmap); -// IndexLauncher launcher(ATTENTION_BWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[1]->region)); -// launcher.add_field(1, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, 
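// A minimal standalone sketch (hypothetical sizes, plain ints rather than
// FlexFlow's nonnegative_int) of the flattened weight-count arithmetic that
// both the commented-out legacy code above and get_weights_shape earlier
// perform: per head, the Q/K/V projections and the output projection are
// packed into a single flat dimension.
#include <cassert>
int main() {
  int q_size = 64, k_size = 64, v_size = 64; // hypothetical input feature sizes
  int kdim = 32, vdim = 32, embed_dim = 128; // hypothetical attrs
  int q_proj = q_size * kdim;                // W^Q_i
  int k_proj = k_size * kdim;                // W^K_i
  int v_proj = v_size * vdim;                // W^V_i
  int o_proj = vdim * embed_dim;             // W^O, with num_heads factored out
  assert(q_proj + k_proj + v_proj + o_proj == 2048 + 2048 + 2048 + 4096);
  return 0;
}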
-// inputs[2]->region)); -// launcher.add_field(2, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// weights[0]->region)); -// launcher.add_field(3, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// outputs[0]->region_grad)); -// launcher.add_field(4, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// weights[0]->region_grad)); -// launcher.add_field(5, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[0]->region_grad)); -// launcher.add_field(6, FID_DATA); -// int num_regions = 7; -// if (inputs[1]->region != inputs[0]->region) { -// // when key != query -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[1]->region_grad)); -// launcher.add_field(num_regions++, FID_DATA); -// } -// if ((inputs[2]->region != inputs[0]->region) && -// (inputs[2]->region != inputs[1]->region)) { -// // when value != key and value != query -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[2]->region_grad)); -// launcher.add_field(num_regions++, FID_DATA); -// } -// runtime->execute_index_space(ctx, launcher); -// } diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc index 97544d1750..b9049bf461 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc @@ -31,9 +31,9 @@ tl::expected 3)); } - size_t seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); - size_t seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); - size_t seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); + nonnegative_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); + nonnegative_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); + nonnegative_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); if (!all_same(seq_len_q, seq_len_k, seq_len_v)) { return tl::unexpected(fmt::format( @@ -43,9 +43,9 @@ tl::expected seq_len_v)); } - size_t batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); - size_t batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); - size_t batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); + nonnegative_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); + nonnegative_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); + nonnegative_int batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); if (!all_same(batch_size_q, batch_size_k, batch_size_v)) { return tl::unexpected(fmt::format( @@ -63,9 +63,9 @@ tl::expected input_v.data_type)); } - size_t q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); - size_t k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); - size_t v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); + nonnegative_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); + nonnegative_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); + nonnegative_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); return MultiHeadAttentionInputs{ batch_size_q, diff --git 
a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc index 3bd0825555..d69b62b759 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc @@ -107,9 +107,9 @@ tl::expected value_dim.degree)); } - int discard_copy_q = get_discard_copy_degree(input_q); - int discard_copy_k = get_discard_copy_degree(input_k); - int discard_copy_v = get_discard_copy_degree(input_v); + nonnegative_int discard_copy_q = get_discard_copy_degree(input_q); + nonnegative_int discard_copy_k = get_discard_copy_degree(input_k); + nonnegative_int discard_copy_v = get_discard_copy_degree(input_v); if (!all_same(discard_copy_q, discard_copy_k, discard_copy_v)) { return tl::unexpected(fmt::format("Q, K, V disagree on the discard-copy " diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc index 71118db7a6..d32ae33d14 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc @@ -57,13 +57,13 @@ tl::expected input_rhs.data_type)); } - size_t lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); - size_t n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); - size_t lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); + nonnegative_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); + nonnegative_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); + nonnegative_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); - size_t rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); - size_t rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); - size_t p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); + nonnegative_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); + nonnegative_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); + nonnegative_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( @@ -76,7 +76,7 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ lhs_b, n, p, @@ -151,9 +151,10 @@ tl::expected ShardParallelDim output_n = n; ShardParallelDim output_p = p; - int output_discard_copy_degree = 1; - int output_sum_degree = get_total_parallel_degree(input_lhs) / - (output_b.degree * output_n.degree * output_p.degree); + nonnegative_int output_discard_copy_degree = 1_n; + nonnegative_int output_sum_degree = + get_total_parallel_degree(input_lhs) / + (output_b.degree * output_n.degree * output_p.degree); ParallelTensorShape result = ParallelTensorShape{ ParallelTensorDims{ diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index 472e5f1a25..ed58fe5189 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -67,10 +67,10 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - size_t num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_channels, }}, DataType::FLOAT, @@ -97,26 +97,23 @@ static std::optional input_degrees); } - if (input_degrees.sum_degree != SumDegree{1}) { + if (input_degrees.sum_degree != SumDegree{1_n}) { return fmt::format("Expected sum degree 1, but received sum degree {}", input_degrees.sum_degree); } -
if (input_degrees.discard_copy_degree != DiscardCopyDegree{1}) { + if (input_degrees.discard_copy_degree != DiscardCopyDegree{1_n}) { return fmt::format( "Expected discard copy degree 1, but received discard copy degree {}", input_degrees.discard_copy_degree); } - FFOrdered non_channel_degrees = - concat(slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{0}}, - ff_dim_t{nonnegative_int{1}}), - slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{2}}, - std::nullopt)); + FFOrdered non_channel_degrees = + concat(slice(input_degrees.shard_degrees, ff_dim_t{0_n}, ff_dim_t{1_n}), + slice(input_degrees.shard_degrees, ff_dim_t{2_n}, std::nullopt)); - if (any_of(non_channel_degrees, [](int degree) { return degree != 1; })) { + if (any_of(non_channel_degrees, + [](nonnegative_int degree) { return degree != 1_n; })) { return fmt::format("Expected parallel degree of all non-channel dimensions " "to be 1, but received input with degrees {}", input_degrees); @@ -159,9 +156,9 @@ tl::expected relative_ff_dim_t channel_dim = relative_ff_dim_t{1}; return ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index 3019151236..fc42241ef2 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -17,7 +17,8 @@ tl::expected get_output_shape(ConcatAttrs const &attrs, std::vector const &inputs) { auto get_non_axis_dims = [&](TensorShape const &s) { - std::map dim_sizes = enumerate(ff_ordered(s.dims)); + std::map dim_sizes = + enumerate(ff_ordered(s.dims)); dim_sizes.erase(attrs.axis); return dim_sizes; }; @@ -40,8 +41,8 @@ tl::expected inputs)); } - std::map non_axis_dims = ({ - tl::expected, std::string> returned = + std::map non_axis_dims = ({ + tl::expected, std::string> returned = require_all_same1(transform(inputs, get_non_axis_dims)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -49,12 +50,12 @@ tl::expected returned.value(); }); - std::vector axis_dim_sizes = + std::vector axis_dim_sizes = transform(inputs, [&](TensorShape const &s) { return dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); }); - size_t output_axis_dim_size = sum(axis_dim_sizes); + nonnegative_int output_axis_dim_size = sum(axis_dim_sizes); non_axis_dims.insert({attrs.axis, output_axis_dim_size}); @@ -88,7 +89,7 @@ tl::expected }); SumDegree sum_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_sum_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -97,7 +98,7 @@ tl::expected }); DiscardCopyDegree discard_copy_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_discard_copy_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index eac756cc15..d1ba536d24 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -25,11 +25,11 @@ TensorShape get_kernel_shape(Conv2DAttrs const &attrs, Conv2DInputShape input = parse_input_shape(raw_input_shape); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.out_channels), + TensorDims{FFOrdered{ + attrs.out_channels,
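// A standalone sketch (hypothetical degrees, plain ints) of the sum-degree
// bookkeeping in the batch_matmul hunk above: the output keeps the shard
// degrees it can express, and whatever parallelism of the lhs input is left
// over becomes the output's sum degree.
#include <cassert>
int main() {
  int total_lhs_degree = 8;            // e.g. 2-way batch shard x 4-way sum
  int out_b = 2, out_n = 1, out_p = 1; // output shard degrees
  int out_sum_degree = total_lhs_degree / (out_b * out_n * out_p);
  assert(out_sum_degree == 4); // the 4-way contraction survives as a sum degree
  return 0;
}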
input.num_channels, - size_t_from_int(attrs.kernel_h), - size_t_from_int(attrs.kernel_w), + attrs.kernel_h, + attrs.kernel_w, }}, input.datatype, }; @@ -42,29 +42,44 @@ TensorShape get_bias_shape(Conv2DAttrs const &attrs, return TensorShape{ TensorDims{ - FFOrdered{size_t_from_int(attrs.out_channels)}, + FFOrdered{attrs.out_channels}, }, input.datatype, }; } +static nonnegative_int calculate_output_size(nonnegative_int input_size, + nonnegative_int padding_size, + nonnegative_int kernel_size, + nonnegative_int stride) { + int input_size_raw = input_size.unwrap_nonnegative(); + int padding_raw = padding_size.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.unwrap_nonnegative(); + int stride_raw = stride.unwrap_nonnegative(); + + return nonnegative_int{ + (input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; +} + TensorShape get_output_shape(Conv2DAttrs const &attrs, TensorShape const &raw_input_shape) { assert(attrs.groups == 1); // TODO(@lockshaw): currently not supported Conv2DInputShape input = parse_input_shape(raw_input_shape); - size_t out_height = - (input.height + (2 * attrs.padding_h) - attrs.kernel_h) / attrs.stride_h + - 1; - size_t out_width = - (input.width + (2 * attrs.padding_w) - attrs.kernel_w) / attrs.stride_w + - 1; - - assert(attrs.out_channels > 0); - - return TensorShape{TensorDims{FFOrdered{ + nonnegative_int out_height = + calculate_output_size(/*input_size=*/input.height, + /*padding_size=*/attrs.padding_h, + /*kernel_size=*/attrs.kernel_h, + /*stride_size=*/attrs.stride_h); + nonnegative_int out_width = + calculate_output_size(/*input_size=*/input.width, + /*padding_size=*/attrs.padding_w, + /*kernel_size=*/attrs.kernel_w, + /*stride_size=*/attrs.stride_w); + + return TensorShape{TensorDims{FFOrdered{ input.num_samples, - size_t_from_int(attrs.out_channels), + attrs.out_channels, out_height, out_width, }}, @@ -82,14 +97,14 @@ ParallelTensorShape get_kernel_shape(Conv2DAttrs const &attrs, assert(parsed.height_dim.degree == 1); assert(parsed.width_dim.degree == 1); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.sample_dim.degree * parsed.sum_reduction_degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, parsed.channel_dim.degree, - 1, - 1, + 1_n, + 1_n, }; return lift_to_parallel_with_degrees( @@ -109,7 +124,7 @@ ParallelTensorShape get_bias_shape(Conv2DAttrs const &attrs, DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.height_dim.degree * parsed.width_dim.degree * parsed.sample_dim.degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, }; @@ -130,12 +145,12 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{parsed.sum_reduction_degree * parsed.channel_dim.degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = { + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = { parsed.sample_dim.degree, parsed.discard_copy_reduction_degree, - 1, - 1, + 1_n, + 1_n, }; return lift_to_parallel_with_degrees( diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc index aad067feb2..1491410491 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc +++ 
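// A standalone sketch of the sliding-window formula factored into
// calculate_output_size above (values are hypothetical):
#include <cassert>
static int output_size(int input, int padding, int kernel, int stride) {
  return (input + 2 * padding - kernel) / stride + 1;
}
int main() {
  // a 32-wide input with a 3-wide kernel, stride 2, padding 1 -> 16-wide output
  assert(output_size(/*input=*/32, /*padding=*/1, /*kernel=*/3, /*stride=*/2) == 16);
  return 0;
}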
b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc @@ -6,10 +6,10 @@ namespace FlexFlow { Conv2DInputShape parse_input_shape(TensorShape const &input) { assert(num_dims(input) == 4); - size_t num_samples = dim_at_idx(input, relative_ff_dim_t{0}); - size_t in_channels = dim_at_idx(input, relative_ff_dim_t{1}); - size_t in_height = dim_at_idx(input, relative_ff_dim_t{2}); - size_t in_width = dim_at_idx(input, relative_ff_dim_t{3}); + nonnegative_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); + nonnegative_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); + nonnegative_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); + nonnegative_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); return Conv2DInputShape{ num_samples, diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index fe557695da..29bd70be2f 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -50,9 +50,9 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(attrs.num_entries), - size_t_from_int(attrs.out_channels), + FFOrdered{ + attrs.num_entries, + attrs.out_channels, }, }, attrs.data_type, @@ -74,8 +74,8 @@ tl::expected SumDegree sum_degree = SumDegree{shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = transform(input.dims.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); @@ -96,13 +96,13 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product( - transform(ff_ordered_shard_dims(input.dims), - [](ShardParallelDim const &d) -> int { return d.degree; }))}; - int entry_dim_degree = 1; - int out_channel_degree = get_discard_copy_degree(input); - FFOrdered shard_degrees = { + SumDegree sum_degree = SumDegree{1_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(transform( + ff_ordered_shard_dims(input.dims), + [](ShardParallelDim const &d) -> nonnegative_int { return d.degree; }))}; + nonnegative_int entry_dim_degree = 1_n; + nonnegative_int out_channel_degree = get_discard_copy_degree(input); + FFOrdered shard_degrees = { entry_dim_degree, out_channel_degree, }; diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index bc86102566..8ed12167b3 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -11,12 +11,11 @@ namespace FlexFlow { TensorShape get_output_shape(FlatAttrs const &attrs, TensorShape const &input_shape) { - FFOrdered leading_dims = slice(ff_ordered(input_shape.dims), - ff_dim_t{nonnegative_int{0}}, - attrs.start_dim); - FFOrdered flattened_dims = + FFOrdered leading_dims = + slice(ff_ordered(input_shape.dims), ff_dim_t{0_n}, attrs.start_dim); + FFOrdered flattened_dims = slice(ff_ordered(input_shape.dims), attrs.start_dim, attrs.end_dim); - FFOrdered trailing_dims = + FFOrdered trailing_dims = slice(ff_ordered(input_shape.dims), attrs.end_dim, std::nullopt); if (flattened_dims.empty()) { @@ -38,14 +37,15 @@ TensorShape get_output_shape(FlatAttrs const &attrs, tl::expected get_output_parallel_dim_degrees( FlatAttrs const &attrs, ParallelTensorDimDegrees const &input_degrees) { - FFOrdered 
flattened_dim_degrees = + FFOrdered flattened_dim_degrees = slice(input_degrees.shard_degrees, attrs.start_dim, attrs.end_dim); if (flattened_dim_degrees.empty()) { return input_degrees; } - if (any_of(flattened_dim_degrees, [](int degree) { return degree != 1; })) { + if (any_of(flattened_dim_degrees, + [](nonnegative_int degree) { return degree != 1; })) { return tl::unexpected( fmt::format("get_output_parallel_dim_degrees for {} expected all shard " "degrees of flattened dimensions to be 1, but received {}", @@ -58,9 +58,7 @@ tl::expected /*discard_copy_degree=*/input_degrees.discard_copy_degree, /*shard_degrees=*/ concat(std::vector{ - slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{0}}, - attrs.start_dim), + slice(input_degrees.shard_degrees, ff_dim_t{0_n}, attrs.start_dim), {product(flattened_dim_degrees)}, slice(input_degrees.shard_degrees, attrs.end_dim, std::nullopt), }), diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 86426dd18f..2394579e53 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -71,7 +71,7 @@ tl::expected std::vector non_layer_norm_dim_idxs = filter( get_idxs(input_shape.dims.ff_ordered), [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); - std::vector raw_weight_dims = + std::vector raw_weight_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { return dim_at_idx(input_shape, relative_ff_dim_t_from_ff_dim_t(dim_idx)); @@ -174,8 +174,8 @@ tl::expected ParallelTensorDims{ ff_ordered_of(raw_weight_shard_dims), ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index e00a47d490..0387c143d7 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -41,11 +41,11 @@ RecordFormatter as_dot(LinearAttrs const &attrs) { tl::expected get_projection_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { - size_t in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); + nonnegative_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); return TensorShape{ TensorDims{ - FFOrdered{in_channels, size_t_from_int(attrs.out_channels)}, + FFOrdered{in_channels, attrs.out_channels}, }, input_shape.data_type, }; @@ -55,7 +55,7 @@ tl::expected get_bias_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { return TensorShape{ TensorDims{ - FFOrdered{size_t_from_int(attrs.out_channels)}, + FFOrdered{attrs.out_channels}, }, input_shape.data_type, }; @@ -64,8 +64,7 @@ tl::expected tl::expected get_output_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { TensorShape output_shape = input_shape; - output_shape.dims.ff_ordered.at(relative_ff_dim_t{-1}) = - size_t_from_int(attrs.out_channels); + output_shape.dims.ff_ordered.at(relative_ff_dim_t{-1}) = attrs.out_channels; return output_shape; } @@ -82,12 +81,12 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = FFOrdered{ + FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, 
get_discard_copy_degree(input), }; @@ -112,7 +111,8 @@ tl::expected shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice( ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = FFOrdered{get_discard_copy_degree(input)}; + FFOrdered shard_degrees = + FFOrdered{get_discard_copy_degree(input)}; return lift_to_parallel_with_degrees( unpar, sum_degree, discard_copy_degree, shard_degrees); @@ -133,8 +133,8 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = ff_ordered_shard_degrees(input); + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = ff_ordered_shard_degrees(input); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); return lift_to_parallel_with_degrees( diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index 86d287ebc8..f9630e16b1 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -8,8 +8,8 @@ namespace FlexFlow { tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp pool_type, std::optional const &activation) { // AdaptivePool2D semantics pulled from @@ -22,10 +22,10 @@ tl::expected input_dims)); } - size_t num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); - size_t num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); - size_t input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); - size_t input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); + nonnegative_int num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); + nonnegative_int num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); + nonnegative_int input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); + nonnegative_int input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); if (input_h % output_h != 0) { return tl::unexpected(fmt::format( @@ -55,29 +55,29 @@ tl::expected // = `ind / outd` // = `stride` - int kernel_h = input_h / output_h; - int kernel_w = input_w / output_w; + nonnegative_int kernel_h = input_h / output_h; + nonnegative_int kernel_w = input_w / output_w; - int stride_h = kernel_h; - int stride_w = kernel_w; + nonnegative_int stride_h = kernel_h; + nonnegative_int stride_w = kernel_w; Pool2DAttrs attrs = Pool2DAttrs{ /*kernel_h=*/kernel_h, /*kernel_w=*/kernel_w, /*stride_h=*/stride_h, /*stride_w=*/stride_w, - /*padding_h=*/0, - /*padding_w=*/0, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/pool_type, /*activation=*/activation, }; TensorShape expected_ouput_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, num_channels, - size_t_from_int(output_h), - size_t_from_int(output_w), + output_h, + output_w, }}, DataType::FLOAT, }; @@ -104,6 +104,19 @@ tl::expected return attrs; } +static nonnegative_int calculate_output_size(nonnegative_int input_size, + nonnegative_int padding_size, + nonnegative_int kernel_size, + nonnegative_int stride) { + int input_size_raw = input_size.unwrap_nonnegative(); + int padding_raw = padding_size.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.unwrap_nonnegative(); + int stride_raw = stride.unwrap_nonnegative(); + + return nonnegative_int{ + 
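// A standalone sketch (hypothetical sizes) of the AdaptivePool2D reduction in
// make_adaptive_pool2d_attrs above: once input % output == 0 has been checked,
// kernel and stride both equal input / output, so an ordinary Pool2DAttrs
// reproduces the adaptive behavior exactly.
#include <cassert>
int main() {
  int input_h = 8, output_h = 2;
  assert(input_h % output_h == 0);   // precondition checked by the function
  int kernel_h = input_h / output_h; // 4
  int stride_h = kernel_h;           // 4
  // zero padding: (8 + 0 - 4) / 4 + 1 == 2, recovering the requested size
  assert((input_h - kernel_h) / stride_h + 1 == output_h);
  return 0;
}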
(input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; +} + tl::expected get_output_shape(Pool2DAttrs const &attrs, TensorShape const &input_shape) { if (num_dims(input_shape) != 4) { @@ -113,19 +126,23 @@ tl::expected input_shape)); } - size_t num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); - size_t num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); - size_t input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); - size_t input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); - - size_t output_height = - (input_height + 2 * attrs.padding_h - attrs.kernel_h) / attrs.stride_h + - 1; - - size_t output_width = - (input_width + 2 * attrs.padding_w - attrs.kernel_w) / attrs.stride_w + 1; - - return TensorShape{TensorDims{FFOrdered{ + nonnegative_int num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); + nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + nonnegative_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); + nonnegative_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); + + nonnegative_int output_height = + calculate_output_size(/*input_size=*/input_height, + /*padding_size=*/attrs.padding_h, + /*kernel_size=*/attrs.kernel_h, + /*stride_size=*/attrs.stride_h); + nonnegative_int output_width = + calculate_output_size(/*input_size=*/input_width, + /*padding_size=*/attrs.padding_w, + /*kernel_size=*/attrs.kernel_w, + /*stride_size=*/attrs.stride_w); + + return TensorShape{TensorDims{FFOrdered{ num_samples, num_channels, output_height, diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 0bb940924a..7a8f91e498 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -7,9 +7,11 @@ #include "op-attrs/tensor_dims.h" #include "utils/containers/all_of.h" #include "utils/containers/product.h" +#include "utils/containers/repeat_element.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -17,7 +19,8 @@ FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &d) { return d.shard_dims; } -FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &d) { +FFOrdered + ff_ordered_shard_degrees(ParallelTensorDims const &d) { return transform(d.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); } @@ -27,8 +30,8 @@ std::unordered_set return get_replica_dims(d.replica_dims); } -size_t num_shard_dims(ParallelTensorDims const &dims) { - return dims.shard_dims.size(); +nonnegative_int num_shard_dims(ParallelTensorDims const &dims) { + return num_elements(dims.shard_dims); } ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { @@ -40,22 +43,22 @@ ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { } ParallelTensorDims lift_to_parallel(TensorDims const &dims) { - std::vector shard_degrees(num_dims(dims), - 1); // 1 repeated num_dims(dims) times + std::vector shard_degrees = + repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_n); return lift_to_parallel_with_degrees( - dims, SumDegree{1}, DiscardCopyDegree{1}, shard_degrees); + dims, SumDegree{1_n}, DiscardCopyDegree{1_n}, shard_degrees); } -ParallelTensorDims - lift_to_parallel_with_degrees(TensorDims const &unpar, - SumDegree const &sum_degree, - DiscardCopyDegree const &discard_copy_degree, - 
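// A simplified sketch of the lifting in parallel_tensor_dims.cc above: each
// unparallelized dim size is zipped with its shard degree to produce a shard
// dim (types reduced to plain ints here; the real code pairs nonnegative_int
// sizes and degrees into ShardParallelDim values).
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>
int main() {
  std::vector<int> sizes = {8, 4};   // hypothetical tensor dims
  std::vector<int> degrees = {2, 1}; // hypothetical shard degrees
  std::vector<std::pair<int, int>> shard_dims; // (size, degree) pairs
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    shard_dims.emplace_back(sizes[i], degrees[i]);
  }
  assert(shard_dims.front() == std::make_pair(8, 2));
  return 0;
}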
FFOrdered const &shard_degrees) { +ParallelTensorDims lift_to_parallel_with_degrees( + TensorDims const &unpar, + SumDegree const &sum_degree, + DiscardCopyDegree const &discard_copy_degree, + FFOrdered const &shard_degrees) { std::vector lifted = transform(zip(vector_of(unpar.ff_ordered), vector_of(shard_degrees)), - [](std::pair const &p) { - size_t size = p.first; - int degree = p.second; + [](std::pair const &p) { + nonnegative_int size = p.first; + nonnegative_int degree = p.second; return ShardParallelDim{size, degree}; }); @@ -75,17 +78,17 @@ ParallelTensorDims degrees.shard_degrees); } -int total_replica_degree(ParallelTensorDims const &dims) { +nonnegative_int total_replica_degree(ParallelTensorDims const &dims) { return dims.replica_dims.discard_copy_degree.value * dims.replica_dims.sum_degree.value; } -int total_shard_degree(ParallelTensorDims const &dims) { +nonnegative_int total_shard_degree(ParallelTensorDims const &dims) { return product(transform(vector_of(dims.shard_dims), [](ShardParallelDim const &d) { return d.degree; })); } -int total_parallel_degree(ParallelTensorDims const &dims) { +nonnegative_int total_parallel_degree(ParallelTensorDims const &dims) { return total_replica_degree(dims) * total_shard_degree(dims); } @@ -115,7 +118,7 @@ TensorDims get_tensor_dims_unsafe(ParallelTensorDims const &) { } TensorDims get_reduced_dims(ParallelTensorDims const &dims) { - FFOrdered dim_sizes = transform( + FFOrdered dim_sizes = transform( dims.shard_dims, [](ShardParallelDim const &d) { return d.size; }); return TensorDims{dim_sizes}; } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index bbad13b46b..260ec7c3cd 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -6,11 +6,12 @@ #include "utils/containers/range.h" #include "utils/containers/transform.h" #include "utils/hash-utils.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include "utils/overload.h" namespace FlexFlow { -int num_shard_dims(ParallelTensorShape const &s) { +nonnegative_int num_shard_dims(ParallelTensorShape const &s) { return num_shard_dims(s.dims); } @@ -19,21 +20,21 @@ std::unordered_set return replica_dims(s.dims); } -int get_num_replicas(ParallelTensorShape const &shape) { - return product( - transform(replica_dims(shape), - [](ReplicaParallelDim const &d) -> int { return d.degree; })); +nonnegative_int get_num_replicas(ParallelTensorShape const &shape) { + return product(transform( + replica_dims(shape), + [](ReplicaParallelDim const &d) -> nonnegative_int { return d.degree; })); } -int get_sum_degree(ParallelTensorShape const &shape) { +nonnegative_int get_sum_degree(ParallelTensorShape const &shape) { return shape.dims.replica_dims.sum_degree.value; } -int get_discard_copy_degree(ParallelTensorShape const &shape) { +nonnegative_int get_discard_copy_degree(ParallelTensorShape const &shape) { return shape.dims.replica_dims.discard_copy_degree.value; } -int get_total_parallel_degree(ParallelTensorShape const &s) { +nonnegative_int get_total_parallel_degree(ParallelTensorShape const &s) { return total_parallel_degree(s.dims); } @@ -51,7 +52,8 @@ ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, return shard_dim_at_idx(s.dims, d); } -FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &s) { +FFOrdered + ff_ordered_shard_degrees(ParallelTensorShape const &s) { return ff_ordered_shard_degrees(s.dims); } @@ -73,11 +75,11 @@ ParallelTensorShape 
lift_to_parallel(TensorShape const &s) { return ParallelTensorShape{lift_to_parallel(s.dims), s.data_type}; } -ParallelTensorShape - lift_to_parallel_with_degrees(TensorShape const &unpar, - SumDegree const &sum_degree, - DiscardCopyDegree const &discard_copy_degree, - FFOrdered const &shard_degrees) { +ParallelTensorShape lift_to_parallel_with_degrees( + TensorShape const &unpar, + SumDegree const &sum_degree, + DiscardCopyDegree const &discard_copy_degree, + FFOrdered const &shard_degrees) { return ParallelTensorShape{ lift_to_parallel_with_degrees( unpar.dims, sum_degree, discard_copy_degree, shard_degrees), @@ -95,8 +97,8 @@ ParallelTensorShape } TensorShape require_not_parallel(ParallelTensorShape const &s) { - int total_degree = get_total_parallel_degree(s); - if (total_degree != 1) { + nonnegative_int total_degree = get_total_parallel_degree(s); + if (total_degree != 1_n) { throw mk_runtime_error( fmt::format("Error: require_not_parallel received a parallel tensor " "shape with parallel degree {}: {}", @@ -124,25 +126,27 @@ TensorShape get_reduced_shape(ParallelTensorShape const &s) { ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape, parallel_tensor_dim_idx_t idx) { - return idx.visit( - overload{[&](ff_dim_t shard_dim) { - return ParallelDim{shape.dims.shard_dims.at(shard_dim)}; - }, - [&](ReplicaType replica_type) { - ReplicaParallelDimSet replicas = shape.dims.replica_dims; - int degree = (ReplicaType::SUM == replica_type - ? replicas.sum_degree.value - : replicas.discard_copy_degree.value); - return ParallelDim{ReplicaParallelDim{degree, replica_type}}; - }}); + return idx.visit(overload{ + [&](ff_dim_t shard_dim) { + return ParallelDim{shape.dims.shard_dims.at(shard_dim)}; + }, + [&](ReplicaType replica_type) { + ReplicaParallelDimSet replicas = shape.dims.replica_dims; + nonnegative_int degree = (ReplicaType::SUM == replica_type + ? 
replicas.sum_degree.value + : replicas.discard_copy_degree.value); + return ParallelDim{ReplicaParallelDim{degree, replica_type}}; + }}); } std::unordered_set get_parallel_tensor_dim_indices(ParallelTensorShape const &shape) { std::unordered_set indices; - extend(indices, transform(range(num_shard_dims(shape.dims)), [](int idx) { - return parallel_tensor_dim_idx_t{ff_dim_t{nonnegative_int{idx}}}; - })); + extend(indices, + transform(nonnegative_range(num_shard_dims(shape.dims)), + [](nonnegative_int idx) { + return parallel_tensor_dim_idx_t{ff_dim_t{idx}}; + })); indices.insert(parallel_tensor_dim_idx_t{ReplicaType::SUM}); indices.insert(parallel_tensor_dim_idx_t{ReplicaType::DISCARD_COPY}); return indices; diff --git a/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc b/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc index 0671bb05f2..a987841b18 100644 --- a/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc +++ b/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc @@ -3,10 +3,10 @@ namespace FlexFlow { ff_dim_t ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t ff_dim, - int input_dim) { + nonnegative_int input_dim) { int raw = ff_dim.value; if (raw < 0) { - raw = input_dim + raw; + raw = input_dim.unwrap_nonnegative() + raw; } return ff_dim_t{nonnegative_int{raw}}; } diff --git a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc index 20c88c77dc..fc712be10b 100644 --- a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc +++ b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc @@ -4,11 +4,11 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set() { - return ReplicaParallelDimSet{SumDegree{1}, DiscardCopyDegree{1}}; + return ReplicaParallelDimSet{SumDegree{1_n}, DiscardCopyDegree{1_n}}; } -int get_order_of_replica_type(ReplicaParallelDimSet const &s, - ReplicaType replica_type) { +nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, + ReplicaType replica_type) { switch (replica_type) { case ReplicaType::SUM: return s.sum_degree.value; diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index f0ac88d8e4..f9198bbe28 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -8,22 +8,23 @@ #include "utils/containers/vector_of.h" #include "utils/containers/zip.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &dims) { +FFOrdered const &ff_ordered(TensorDims const &dims) { return dims.ff_ordered; } -size_t num_dims(TensorDims const &dims) { - return dims.ff_ordered.size(); +nonnegative_int num_dims(TensorDims const &dims) { + return num_elements(dims.ff_ordered); } -size_t dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { +nonnegative_int dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } -size_t &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { +nonnegative_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } @@ -33,8 +34,8 @@ bool tensor_dims_is_broadcastable_to(TensorDims const &curr, return false; } - std::vector curr_dims = vector_of(curr.ff_ordered); - std::vector goal_dims = vector_of(goal.ff_ordered); + std::vector curr_dims = vector_of(curr.ff_ordered); + std::vector goal_dims = vector_of(goal.ff_ordered); for (auto const &[curr_dim, goal_dim] : zip(reversed(curr_dims), reversed(goal_dims))) { diff 
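// A standalone sketch of the negative-index resolution performed by
// ff_dim_t_from_relative_ff_dim_t above (Python-style wrapping; values
// hypothetical):
#include <cassert>
static int resolve_relative_dim(int raw, int input_dim) {
  return raw < 0 ? input_dim + raw : raw;
}
int main() {
  assert(resolve_relative_dim(-1, 4) == 3); // last dim of a 4-d tensor
  assert(resolve_relative_dim(-3, 4) == 1);
  assert(resolve_relative_dim(0, 4) == 0);  // nonnegative indices unchanged
  return 0;
}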
--git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 70ed58aac6..690a07d26a 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -4,26 +4,27 @@ #include "utils/containers/get_only.h" #include "utils/containers/product.h" #include "utils/containers/transform.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -size_t num_dims(TensorShape const &s) { - return s.dims.ff_ordered.size(); +nonnegative_int num_dims(TensorShape const &s) { + return num_elements(s.dims.ff_ordered); } -size_t dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { +nonnegative_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -size_t &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { +nonnegative_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -size_t get_num_elements(TensorShape const &s) { +nonnegative_int get_num_elements(TensorShape const &s) { return product(s.dims.ff_ordered); } -size_t get_size_in_bytes(TensorShape const &s) { +nonnegative_int get_size_in_bytes(TensorShape const &s) { return get_num_elements(s) * size_of_datatype(s.data_type); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/attention.cc b/lib/op-attrs/test/src/op-attrs/ops/attention.cc index eca8559b21..b317c5c69c 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/attention.cc @@ -10,10 +10,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs)") { auto make_attrs = [](bool bias) { return MultiHeadAttentionAttrs{ - /*embed_dim=*/32, - /*num_heads=*/10, - /*kdim=*/32, - /*vdim=*/32, + /*embed_dim=*/32_n, + /*num_heads=*/10_n, + /*kdim=*/32_n, + /*vdim=*/32_n, /*dropout=*/0.0, /*bias=*/bias, /*add_bias_kv=*/false, @@ -58,8 +58,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(MultiHeadAttentionAttrs, TensorShape, " "TensorShape, TensorShape)") { - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; /* Parameter meanings match those at * https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html @@ -75,13 +75,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; TensorShape input_q = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -92,7 +92,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_k = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -103,7 +103,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_v = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -114,10 +114,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, - size_t_from_int(attrs.embed_dim), + attrs.embed_dim, }, }, DataType::FLOAT, @@ -125,9 +125,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ - (feature_size * embed_dim) * 3 + (embed_dim * embed_dim), - size_t_from_int(num_heads), + FFOrdered{ + (feature_size * embed_dim) * 3_n + (embed_dim * embed_dim), + num_heads, }, }, DataType::FLOAT, @@ -135,8 +135,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape 
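// A standalone sketch of get_size_in_bytes above, assuming a hypothetical
// FLOAT tensor of shape {2, 3}: the byte size is the element count times
// size_of_datatype.
#include <cassert>
int main() {
  int num_elements = 2 * 3;  // product over the tensor dims
  int bytes_per_element = 4; // size_of_datatype(DataType::FLOAT)
  assert(num_elements * bytes_per_element == 24);
  return 0;
}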
input_bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(embed_dim * 3), + FFOrdered{ + embed_dim * 3_n, }, }, DataType::FLOAT, @@ -144,8 +144,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(embed_dim), + FFOrdered{ + embed_dim, }, }, DataType::FLOAT, @@ -184,72 +184,94 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallel shape inference") { auto make_q = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_q) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_q) { return lift_to_parallel_with_degrees( - input_q, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_q}); + input_q, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_q}); }; auto make_k = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_k) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_k) { return lift_to_parallel_with_degrees( - input_k, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_k}); + input_k, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_k}); }; auto make_v = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_v) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_v) { return lift_to_parallel_with_degrees( - input_v, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_v}); + input_v, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_v}); }; auto make_o = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_o) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_o) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_o}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_o}); }; - auto make_w = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_e, int o_h) { - return lift_to_parallel_with_degrees( - weights, o_sum, o_eq, FFOrdered{o_e, o_h}); - }; + auto make_w = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_e, + nonnegative_int o_h) { + return lift_to_parallel_with_degrees( + weights, o_sum, o_eq, FFOrdered{o_e, o_h}); + }; - auto make_input_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_in_proj_channel) { - return lift_to_parallel_with_degrees( - input_bias, o_sum, o_eq, FFOrdered{o_in_proj_channel}); - }; + auto make_input_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_in_proj_channel) { + return lift_to_parallel_with_degrees( + input_bias, + o_sum, + o_eq, + FFOrdered{o_in_proj_channel}); + }; - auto make_output_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_out_proj_channel) { - return lift_to_parallel_with_degrees( - output_bias, o_sum, o_eq, FFOrdered{o_out_proj_channel}); - }; + auto make_output_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_out_proj_channel) { + return lift_to_parallel_with_degrees( + output_bias, + o_sum, + o_eq, + FFOrdered{o_out_proj_channel}); + }; SUBCASE("data parallelism") { - int o_b = 4; + nonnegative_int o_b = 4_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{1_n}, 
o_b, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_o(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); CHECK(result == correct); } @@ -257,7 +279,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{o_b}, 1, 1); + make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, 1_n); CHECK(result == correct); } @@ -265,7 +287,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_b}, 1); + make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); CHECK(result == correct); } @@ -273,25 +295,25 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_b}, 1); + make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); CHECK(result == correct); } } SUBCASE("attention head parallelism") { - int o_h = 2; + nonnegative_int o_h = 2_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1}, 1, 1, 1); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -299,7 +321,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{1}, 1, o_h); + make_w(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_h); CHECK(result == correct); } @@ -307,7 +329,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_h}, 1); + make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); CHECK(result == correct); } @@ -315,26 +337,26 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_h}, 1); + make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); CHECK(result == correct); } } SUBCASE("combined data & attention head parallelism") { - int o_b = 4; - int o_h = 2; + nonnegative_int o_b = 4_n; + nonnegative_int o_h = 2_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1}, o_b, 1, 1); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, o_b, 
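Every make_q/make_k/make_v/make_o helper in these subcases funnels through lift_to_parallel_with_degrees. Spelled out once for the query shape, a sketch of what one such lift produces, assuming (consistently with the ParallelTensorShape literals constructed directly elsewhere in this patch) that ShardParallelDim pairs the full, unpartitioned dim size with its parallel degree; includes are assumed:

#include "doctest/doctest.h"
// #include "op-attrs/parallel_tensor_shape.h"  // path assumed

using namespace ::FlexFlow;

TEST_SUITE(FF_TEST_SUITE) {
  TEST_CASE("lift_to_parallel_with_degrees worked example (illustrative)") {
    TensorShape input_q = TensorShape{
        TensorDims{FFOrdered<nonnegative_int>{40_n, 48_n, 36_n}},
        DataType::FLOAT,
    };

    // shard the batch dim (40) four ways, leave replica degrees trivial
    ParallelTensorShape lifted = lift_to_parallel_with_degrees(
        input_q,
        SumDegree{1_n},
        DiscardCopyDegree{1_n},
        FFOrdered<nonnegative_int>{4_n, 1_n, 1_n});

    ParallelTensorShape correct = ParallelTensorShape{
        ParallelTensorDims{
            FFOrdered<ShardParallelDim>{
                ShardParallelDim{40_n, 4_n},
                ShardParallelDim{48_n, 1_n},
                ShardParallelDim{36_n, 1_n},
            },
            ReplicaParallelDimSet{
                SumDegree{1_n},
                DiscardCopyDegree{1_n},
            },
        },
        DataType::FLOAT,
    };

    CHECK(lifted == correct);
  }
}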
1_n, 1_n); CHECK(result == correct); } @@ -342,7 +364,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{o_b}, 1, o_h); + make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, o_h); CHECK(result == correct); } @@ -350,7 +372,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_b * o_h}, 1); + make_input_bias( + SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); CHECK(result == correct); } @@ -358,7 +381,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_b * o_h}, 1); + make_output_bias( + SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc index 56a2e3fa52..27c59ee497 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(BatchMatmulAttrs, TensorShape)") { - size_t b = 4; - size_t m = 6; - size_t n = 8; - size_t p = 10; + nonnegative_int b = 4_n; + nonnegative_int m = 6_n; + nonnegative_int n = 8_n; + nonnegative_int p = 10_n; BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/0, // TODO figure out if these arguments are still - // relevant - /*b_seq_length_dim=*/0, + /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are + // still relevant + /*b_seq_length_dim=*/0_n, }; TensorShape input_lhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, m, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("valid") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, m, p, @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct_output_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, p, @@ -60,8 +60,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched b") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ - b + 1, + FFOrdered{ + b + 1_n, m, p, }, @@ -78,9 +78,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched m") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, - m + 1, + m + 1_n, p, }, }, @@ -95,27 +95,27 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_output_shape(BatchMatmulAttrs, ParallelTensorShape)") { - size_t b = 2 * 2; - int o_b = 2; - size_t m = 3 * 3; - int o_m = 3; - size_t n = 5 * 5; - int o_n = 5; - size_t p = 7 * 7; - int o_p = 7; - int o_sum = 11; + nonnegative_int b = 2_n * 2_n; + nonnegative_int o_b = 2_n; + nonnegative_int m = 3_n * 3_n; + nonnegative_int o_m = 3_n; + nonnegative_int n = 5_n * 5_n; + nonnegative_int o_n = 5_n; + nonnegative_int p = 7_n * 7_n; + nonnegative_int o_p = 7_n; + nonnegative_int o_sum = 11_n; BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/0, // TODO figure out if these arguments are still - // relevant - /*b_seq_length_dim=*/0, + /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are + // still relevant + /*b_seq_length_dim=*/0_n, }; auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_n, - int o_m) { + nonnegative_int o_b, + nonnegative_int o_n, + nonnegative_int o_m) { return ParallelTensorShape{ 
ParallelTensorDims{ FFOrdered{ @@ -134,9 +134,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_m, - int o_p) { + nonnegative_int o_b, + nonnegative_int o_m, + nonnegative_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -155,9 +155,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_n, - int o_p) { + nonnegative_int o_b, + nonnegative_int o_n, + nonnegative_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -177,10 +177,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("data parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); CHECK(result == correct); } @@ -188,10 +188,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("n parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, o_n, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_n}, 1, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, o_n, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -199,10 +199,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("p parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{o_p}, 1, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_p)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_p}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_p); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p); CHECK(result == correct); } @@ -210,10 +210,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_m), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, o_m, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_m), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_m, 1_n)); tl::expected correct = - make_output(SumDegree{o_m}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_m}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -221,10 +221,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction lhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_sum}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -232,10 +232,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction rhs") { tl::expected result = get_output_shape( attrs, - 
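The reduction-parallel subcase above is the one place a shard degree moves into SumDegree: splitting the contracted dimension m across o_m workers leaves each worker with only a partial matmul, and the true result is the elementwise sum of the partials. A self-contained 1x1 illustration of why combining those shards is a sum-reduction:

#include <cassert>

int main() {
  // A is 1x2, B is 2x1, so the full product is a[0]*b[0] + a[1]*b[1].
  int a[2] = {3, 5};
  int b[2] = {7, 11};

  int partial_worker0 = a[0] * b[0]; // owns the first slice of m
  int partial_worker1 = a[1] * b[1]; // owns the second slice of m

  int full = a[0] * b[0] + a[1] * b[1];

  // combining the shards is exactly a sum-reduction, hence SumDegree{o_m}
  assert(partial_worker0 + partial_worker1 == full);
}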
make_lhs(SumDegree{1}, DiscardCopyDegree{o_sum}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -243,10 +243,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, 1, 1)); - tl::expected correct = - make_output(SumDegree{o_sum * o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); + tl::expected correct = make_output( + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -254,8 +254,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & rhs (invalid)") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); CHECK_MESSAGE( !result.has_value(), "Unexpected successful value: ", result); @@ -264,10 +264,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, o_n, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_sum * o_n}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), + make_rhs( + SumDegree{1_n}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, o_n, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -275,10 +276,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, o_n, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, 1_n), + make_rhs( + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); tl::expected correct = make_output( - SumDegree{o_sum * o_sum}, DiscardCopyDegree{1}, 1, o_n, 1); + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -286,11 +288,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n & m") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, o_n, o_m), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, o_m), make_rhs( - SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1, o_m, 1)); - tl::expected correct = make_output( - SumDegree{o_sum * o_sum * o_m}, DiscardCopyDegree{1}, 1, o_n, 1); + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, o_m, 1_n)); + tl::expected correct = + make_output(SumDegree{o_sum * o_sum * o_m}, + DiscardCopyDegree{1_n}, + 1_n, + o_n, + 1_n); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc 
b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc index 4196394d00..cd9796945c 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc @@ -60,11 +60,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, - 18, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, + 18_n, }}, DataType::FLOAT, }; @@ -72,8 +72,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 14, + TensorDims{FFOrdered{ + 14_n, }}, DataType::FLOAT, }; @@ -140,16 +140,16 @@ TEST_SUITE(FF_TEST_SUITE) { }(); SUBCASE("partition parallelism (in channel dim)") { - int degree = 2; + nonnegative_int degree = 2_n; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, degree, - 1, - 1, + 1_n, + 1_n, }, }; @@ -169,9 +169,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{degree}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -194,9 +194,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{degree}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -214,12 +214,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (not in channel dim)") { - int degree = 2; + nonnegative_int degree = 2_n; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, degree, 1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, degree, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -251,12 +251,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ sum_degree, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1, 1}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -288,12 +288,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, + SumDegree{1_n}, discard_copy_degree, - FFOrdered{1, 1, 1, 1}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 1}, - ShardParallelDim{14, 2}, - ShardParallelDim{16, 1}, - ShardParallelDim{18, 1}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{14_n, 2_n}, + ShardParallelDim{16_n, 1_n}, + ShardParallelDim{18_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -368,11 +368,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 2}, + ShardParallelDim{14_n, 2_n}, 
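Every numeric literal such as 14_n or 2_n in these tests is a user-defined literal that builds a nonnegative_int. The actual FlexFlow definition is not shown in this patch, so the following is only a sketch of how such a suffix can be declared; the type is deliberately renamed to avoid suggesting it is the real one:

#include <cassert>

struct nonnegative_int_sketch {
  unsigned long long value;
};

// integer-literal operands arrive as unsigned long long, so the input is
// nonnegative by construction and no runtime check is needed here
constexpr nonnegative_int_sketch operator""_n(unsigned long long v) {
  return nonnegative_int_sketch{v};
}

int main() {
  assert((14_n).value == 14);
  assert((2_n).value + (3_n).value == 5);
}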
}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -388,11 +388,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 2}, + ShardParallelDim{14_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/cast.cc b/lib/op-attrs/test/src/op-attrs/ops/cast.cc index c7395316ad..e9ec890b4b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/cast.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/cast.cc @@ -12,15 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { CastAttrs attrs = CastAttrs{output_datatype}; - size_t d1 = 12; - size_t d2 = 16; + nonnegative_int d1 = 12_n; + nonnegative_int d2 = 16_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, input_datatype, }; TensorShape output = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, output_datatype, }; @@ -34,24 +34,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_output_shape(CastAttrs, ParallelTensorShape)") { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_features) { + nonnegative_int o_batch, + nonnegative_int o_features) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_features}); + input, + o_sum, + o_eq, + FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_outchannels) { + nonnegative_int o_batch, + nonnegative_int o_outchannels) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_outchannels}); }; - SumDegree sum_degree = SumDegree{2}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3}; - int batch_degree = 4; - int feature_degree = 8; + SumDegree sum_degree = SumDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3_n}; + nonnegative_int batch_degree = 4_n; + nonnegative_int feature_degree = 8_n; ParallelTensorShape par_input = make_input( sum_degree, discard_copy_degree, batch_degree, feature_degree); diff --git a/lib/op-attrs/test/src/op-attrs/ops/combine.cc b/lib/op-attrs/test/src/op-attrs/ops/combine.cc index 577961b7b1..14fbca5b3a 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/combine.cc @@ -10,22 +10,22 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, }; SUBCASE("valid") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 3; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 3_n; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -44,8 +44,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 4; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 4_n; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, 
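The valid/invalid pair in this test is consistent with a divisibility rule: Combine merges partitions of the chosen dim, whose shard degree in this input is 3, so a repartition_degree of 3 is accepted while 4 is rejected. Restated under that reading (an inference from the test, not a quoted rule):

#include <cassert>

bool combine_degree_is_valid(unsigned shard_degree, unsigned combine_degree) {
  return combine_degree != 0 && shard_degree % combine_degree == 0;
}

int main() {
  assert(combine_degree_is_valid(3, 3));  // the "valid" subcase: 3 / 3 leaves 1 shard
  assert(!combine_degree_is_valid(3, 4)); // the "invalid" subcase: 4 does not divide 3
}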
/*repartition_degree=*/degree, diff --git a/lib/op-attrs/test/src/op-attrs/ops/concat.cc b/lib/op-attrs/test/src/op-attrs/ops/concat.cc index 2d9842b1dd..b84cf38753 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/concat.cc @@ -23,12 +23,12 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - size_t dim0_size = 12; - size_t dim2_size = 20; + nonnegative_int dim0_size = 12_n; + nonnegative_int dim2_size = 20_n; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14, + 14_n, dim2_size, }}, DataType::FLOAT, @@ -45,26 +45,26 @@ TEST_SUITE(FF_TEST_SUITE) { } TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16, + 16_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, DataType::FLOAT, }; SUBCASE("input shapes do not shared the same num_dims") { TensorShape mismatched_num_dims = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 20, + 20_n, dim2_size, - 1, + 1_n, }}, DataType::FLOAT, }; @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(attrs, input_shapes); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14 + 16 + 18, + 14_n + 16_n + 18_n, dim2_size, }}, DataType::FLOAT, @@ -118,84 +118,97 @@ TEST_SUITE(FF_TEST_SUITE) { ff_dim_t{nonnegative_int{1}}, }; - size_t dim0_size = 12; - size_t dim2_size = 20; + nonnegative_int dim0_size = 12_n; + nonnegative_int dim2_size = 20_n; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14, + 14_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16, + 16_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{dim0_size, 14 + 16 + 18, dim2_size}}, + TensorDims{FFOrdered{ + dim0_size, 14_n + 16_n + 18_n, dim2_size}}, DataType::FLOAT, }; - auto lift_input1 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input1 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_input2 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input2 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_input3 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input3 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( 
+ input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("sum reduction parallelism") { SUBCASE("matching") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; std::vector inputs = { - lift_input1(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), + lift_input1(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + lift_output(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{2}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(SumDegree{4}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(SumDegree{4}, DiscardCopyDegree{1}, 1, 1, 1), + lift_input1(SumDegree{2_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), }; std::optional result = @@ -208,27 +221,27 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("discard copy reduction parallelism") { SUBCASE("matching") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; std::vector inputs = { - lift_input1(SumDegree{1}, discard_copy_degree, 1, 1, 1), - lift_input2(SumDegree{1}, discard_copy_degree, 1, 1, 1), - lift_input3(SumDegree{1}, discard_copy_degree, 1, 1, 1), + lift_input1(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(SumDegree{1}, discard_copy_degree, 1, 1, 1); + lift_output(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{2}, 1, 1, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{2}, 1, 1, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{4}, 1, 1, 1), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{4_n}, 1_n, 1_n, 1_n), }; std::optional result = @@ -241,12 +254,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in axis dim") { SUBCASE("matching") { - int degree = 2; + nonnegative_int degree = 2_n; std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), + lift_input1( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + lift_input2( + SumDegree{1_n}, 
DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + lift_input3( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), }; std::optional result = @@ -258,9 +274,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 1, 2, 1), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 2_n, 1_n), }; std::optional result = @@ -273,31 +289,31 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in non-axis shard dims") { SUBCASE("matching") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; std::vector inputs = { lift_input1( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), lift_input2( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), lift_input3( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = lift_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 2, 1, 4), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 4, 1, 2), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 4, 1, 2), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 2_n, 1_n, 4_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), }; std::optional result = @@ -309,21 +325,21 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("parallelism degrees are not mutually exclusive") { - SumDegree sum_degree = SumDegree{3}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5}; - int degree0 = 2; - int degree2 = 4; + SumDegree sum_degree = SumDegree{3_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5_n}; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; std::vector inputs = { - lift_input1(sum_degree, discard_copy_degree, degree0, 1, degree2), - lift_input2(sum_degree, discard_copy_degree, degree0, 1, degree2), - lift_input3(sum_degree, discard_copy_degree, degree0, 1, degree2), + lift_input1(sum_degree, discard_copy_degree, degree0, 1_n, degree2), + lift_input2(sum_degree, discard_copy_degree, degree0, 1_n, degree2), + lift_input3(sum_degree, discard_copy_degree, degree0, 1_n, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, discard_copy_degree, degree0, 1, degree2); + lift_output(sum_degree, discard_copy_degree, degree0, 1_n, degree2); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc index 7abb98f3e3..f5006d4352 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc @@ -7,14 +7,14 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_conv2d_incoming_tensor_roles(Conv2DAttrs") { auto make_attrs = 
[](bool use_bias) { - return Conv2DAttrs{/*out_channels=*/4, - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, - /*groups=*/1, + return Conv2DAttrs{/*out_channels=*/4_n, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, + /*groups=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/use_bias}; }; @@ -48,14 +48,14 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Conv2D shape inference") { - int out_channels = 4; - int kernel_h = 3; - int kernel_w = 2; - int stride_h = 2; - int stride_w = 2; - int padding_h = 1; - int padding_w = 1; - int groups = 1; + nonnegative_int out_channels = 4_n; + nonnegative_int kernel_h = 3_n; + nonnegative_int kernel_w = 2_n; + nonnegative_int stride_h = 2_n; + nonnegative_int stride_w = 2_n; + nonnegative_int padding_h = 1_n; + nonnegative_int padding_w = 1_n; + nonnegative_int groups = 1_n; std::optional activation = std::nullopt; bool use_bias = true; @@ -72,13 +72,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*use_bias=*/true, }; - size_t num_samples = 7; - size_t input_channels = 4; - size_t input_height = 11; - size_t input_width = 15; + nonnegative_int num_samples = 7_n; + nonnegative_int input_channels = 4_n; + nonnegative_int input_height = 11_n; + nonnegative_int input_width = 15_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, input_channels, input_height, @@ -87,13 +87,13 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; - size_t output_height = 6; - size_t output_width = 8; + nonnegative_int output_height = 6_n; + nonnegative_int output_width = 8_n; TensorShape output = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, - size_t_from_int(out_channels), + out_channels, output_height, output_width, }}, @@ -101,18 +101,18 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape kernel = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(out_channels), + TensorDims{FFOrdered{ + out_channels, input_channels, - size_t_from_int(kernel_h), - size_t_from_int(kernel_w), + kernel_h, + kernel_w, }}, DataType::FLOAT, }; TensorShape bias = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(out_channels), + TensorDims{FFOrdered{ + out_channels, }}, DataType::FLOAT, }; @@ -137,147 +137,149 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_n, - int o_c, - int o_h, - int o_w) { + nonnegative_int o_n, + nonnegative_int o_c, + nonnegative_int o_h, + nonnegative_int o_w) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_n, - int o_c, - int o_h, - int o_w) { + nonnegative_int o_n, + nonnegative_int o_c, + nonnegative_int o_h, + nonnegative_int o_w) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto make_kernel = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_outchannels, - int o_inchannels, - int o_kernel_h, - int o_kernel_w) { + nonnegative_int o_outchannels, + nonnegative_int o_inchannels, + nonnegative_int o_kernel_h, + nonnegative_int o_kernel_w) { return lift_to_parallel_with_degrees( kernel, o_sum, o_eq, - FFOrdered{o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); + FFOrdered{ + o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); }; - auto 
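The expected output_height of 6 and output_width of 8 are not arbitrary: they follow from the usual convolution extent formula out = (in + 2*pad - kernel) / stride + 1 (with flooring integer division) applied to this test's parameters. Restated as runnable C++; the helper name is ours, not FlexFlow's:

#include <cassert>

unsigned conv_out_extent(unsigned in, unsigned pad, unsigned kernel, unsigned stride) {
  return (in + 2 * pad - kernel) / stride + 1; // integer division floors
}

int main() {
  assert(conv_out_extent(/*in=*/11, /*pad=*/1, /*kernel=*/3, /*stride=*/2) == 6);
  assert(conv_out_extent(/*in=*/15, /*pad=*/1, /*kernel=*/2, /*stride=*/2) == 8);
}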
make_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_outchannels) { - return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannels}); - }; + auto make_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_outchannels) { + return lift_to_parallel_with_degrees( + bias, o_sum, o_eq, FFOrdered{o_outchannels}); + }; SUBCASE("data parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1}, DiscardCopyDegree{degree}, 1); + make_bias(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n); CHECK(result == correct); } } SUBCASE("input channel parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1}, 1); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = 
get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1}, DiscardCopyDegree{1}, degree); + make_bias(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree); CHECK(result == correct); } } SUBCASE("propagating sum degree") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1}, 1); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc index 7580de24e5..e1a03a7613 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc @@ -15,10 +15,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; @@ -36,48 +36,54 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; TensorShape output = input; - auto make_input = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto make_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("partition parallelism (allowed)") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + ParallelTensorShape 
par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); tl::expected result = get_output_shape(attrs, par_input); - tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + tl::expected correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("sum parallelism (not allowed)") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -87,10 +93,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (not allowed)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorShape par_input = - make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc index d5aab55cb2..d6a92036f0 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc @@ -7,9 +7,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("EWAdd shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched dim size") { TensorShape incorrect_rhs = input_lhs; - dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1; + dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_n; tl::expected result = get_output_shape(attrs, input_lhs, incorrect_rhs); @@ -53,9 +53,9 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("EWAdd parallel shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -66,7 +66,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape unpar_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -83,68 +83,68 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return lift_to_parallel_with_degrees( - unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return lift_to_parallel_with_degrees( - unpar_rhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_rhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return 
lift_to_parallel_with_degrees( - unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("data parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("reduction parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_lhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_rhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("invalid discard copy parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); @@ -154,12 +154,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid mismatched parallelism degrees") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, degree); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, degree); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc index 94c382356e..bac6efba3f 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc @@ -7,16 +7,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ReLU shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementUnaryAttrs attrs = ElementUnaryAttrs{OperatorType::RELU, std::nullopt}; TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -31,20 +31,20 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); - auto make_i = [&](SumDegree o_sum, - DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return 
lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("partition i.e., sharding parallelism") { - int degree1 = 4; - int degree2 = 8; - ParallelTensorShape par_input = - make_i(SumDegree{1}, DiscardCopyDegree{1}, degree1, 1, degree2); + nonnegative_int degree1 = 4_n; + nonnegative_int degree2 = 8_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree1, 1_n, degree2); tl::expected result = get_output_shape(attrs, par_input); @@ -54,10 +54,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum degree > 1") { - int degree = 2; + nonnegative_int degree = 2_n; tl::expected result = get_output_shape( - attrs, make_i(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1)); + attrs, + make_input(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", @@ -65,10 +66,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy degree > 1") { - int degree = 2; + nonnegative_int degree = 2_n; tl::expected result = get_output_shape( - attrs, make_i(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1)); + attrs, + make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", diff --git a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc index 134737f6c0..8fe50a4217 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc @@ -8,8 +8,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Sum embedding shape inference") { - int out_channels = 128; - int num_entries = 1024; + nonnegative_int out_channels = 128_n; + nonnegative_int num_entries = 1024_n; EmbeddingAttrs attrs = EmbeddingAttrs{ /*num_entries=*/num_entries, /*out_channels=*/out_channels, @@ -17,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { /*data_type=*/DataType::FLOAT, }; - size_t batch_size = 48; - size_t features_dim = 56; + nonnegative_int batch_size = 48_n; + nonnegative_int features_dim = 56_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ batch_size, features_dim, }}, @@ -30,9 +30,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -40,9 +40,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(num_entries), - size_t_from_int(out_channels), + FFOrdered{ + num_entries, + out_channels, }, }, DataType::FLOAT, @@ -66,38 +66,44 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_features) { + nonnegative_int o_batch, + nonnegative_int o_features) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_features}); + input, o_sum, o_eq, FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_outchannels) { + nonnegative_int o_batch, + nonnegative_int o_outchannels) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_outchannels}); }; auto make_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_entries, - int o_outchannels) { + nonnegative_int o_entries, + nonnegative_int 
o_outchannels) { return lift_to_parallel_with_degrees( - weights, o_sum, o_eq, FFOrdered{o_entries, o_outchannels}); + weights, + o_sum, + o_eq, + FFOrdered{o_entries, o_outchannels}); }; SUBCASE("data parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); CHECK(result == correct); } @@ -105,21 +111,21 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, par_input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); CHECK(result == correct); } } SUBCASE("input features parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1); + make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n); CHECK(result == correct); } @@ -127,7 +133,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); CHECK(result == correct); } } @@ -139,15 +145,15 @@ TEST_SUITE(FF_TEST_SUITE) { // dimension. 
For now we choose to represent parallelism in the channel // dimension, but partitioning in the entry dimension is also potentially // useful as it produces sum parallelism in the output - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input = - make_input(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); CHECK(result == correct); } @@ -155,7 +161,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_weights(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/flat.cc b/lib/op-attrs/test/src/op-attrs/ops/flat.cc index 8998dfaffd..ebd869b3e5 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/flat.cc @@ -9,25 +9,25 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(FlatAttrs, TensorShape)") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4, - 2, - 3, + TensorDims{FFOrdered{ + 2_n, + 4_n, + 2_n, + 3_n, }}, DataType::FLOAT, }; SUBCASE("flatten all dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{0}}, - /*end_dim=*/ff_dim_t{nonnegative_int{4}}, + /*start_dim=*/ff_dim_t{0_n}, + /*end_dim=*/ff_dim_t{4_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2 * 4 * 2 * 3, + TensorDims{FFOrdered{ + 2_n * 4_n * 2_n * 3_n, }}, DataType::FLOAT, }; @@ -43,10 +43,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4, - 2 * 3, + TensorDims{FFOrdered{ + 2_n, + 4_n, + 2_n * 3_n, }}, DataType::FLOAT, }; @@ -62,10 +62,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2 * 4, - 2, - 3, + TensorDims{FFOrdered{ + 2_n * 4_n, + 2_n, + 3_n, }}, DataType::FLOAT, }; @@ -81,10 +81,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4 * 2, - 3, + TensorDims{FFOrdered{ + 2_n, + 4_n * 2_n, + 3_n, }}, DataType::FLOAT, }; @@ -124,18 +124,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("allows shard parallelism in non-flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 1, 3}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 1_n, 3_n}, }; tl::expected result = get_output_parallel_dim_degrees(attrs, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 3}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 3_n}, }; CHECK(result == correct); @@ -143,9 +143,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not allow shard parallelism in flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 2, 1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 2_n, 1_n}, }; 
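// Reasoning sketch (inferred from the subcase above, which maps shard
// degrees {2, 1, 1, 3} to {2, 1, 3}, and not stated explicitly in the
// patch): these attrs merge the two middle dims, and here the shard degree
// of 2 sits on a dim inside that flattened range. A flatten cannot split
// or relocate an existing sharding, so no consistent set of output degrees
// exists and get_output_parallel_dim_degrees is expected to return an
// error rather than a ParallelTensorDimDegrees value.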
std::optional result = @@ -157,18 +157,18 @@ SUBCASE("allows sum parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1, 1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n}, }; CHECK(result == correct); @@ -176,18 +176,18 @@ SUBCASE("allows discard copy parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{2}, - FFOrdered{1, 1, 1, 1}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{2}, - FFOrdered{1, 1, 1}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, + FFOrdered{1_n, 1_n, 1_n}, }; CHECK(result == correct); @@ -203,22 +203,22 @@ ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4, 2}, - ShardParallelDim{8, 1}, - ShardParallelDim{6, 1}, - ShardParallelDim{9, 3}, + ShardParallelDim{4_n, 2_n}, + ShardParallelDim{8_n, 1_n}, + ShardParallelDim{6_n, 1_n}, + ShardParallelDim{9_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{7}, - DiscardCopyDegree{5}, + SumDegree{7_n}, + DiscardCopyDegree{5_n}, }, }, DataType::FLOAT, }; FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{1}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3}}, + /*start_dim=*/ff_dim_t{1_n}, + /*end_dim=*/ff_dim_t{3_n}, }; tl::expected result = @@ -227,13 +227,13 @@ ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4, 2}, - ShardParallelDim{8 * 6, 1}, - ShardParallelDim{9, 3}, + ShardParallelDim{4_n, 2_n}, + ShardParallelDim{8_n * 6_n, 1_n}, + ShardParallelDim{9_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{7}, - DiscardCopyDegree{5}, + SumDegree{7_n}, + DiscardCopyDegree{5_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc index b9426a89a2..b9aa3c0677 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc @@ -58,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, - 18, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, + 18_n, }}, DataType::FLOAT, }; @@ -70,9 +70,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 12, - 16, + TensorDims{FFOrdered{ + 12_n, + 16_n, }}, DataType::FLOAT, }; @@ -125,49 +125,58 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o0, - int o1, - int o2, - int o3) { + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2, + nonnegative_int o3) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o0, - int o1, - int o2, - int o3) { +
nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2, + nonnegative_int o3) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; - auto make_gamma_weights = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o2) { - return lift_to_parallel_with_degrees( - gamma, o_sum, o_eq, FFOrdered{o0, o2}); - }; + auto make_gamma_weights = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + gamma, o_sum, o_eq, FFOrdered{o0, o2}); + }; - auto make_beta_weights = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o2) { - return lift_to_parallel_with_degrees( - beta, o_sum, o_eq, FFOrdered{o0, o2}); - }; + auto make_beta_weights = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + beta, o_sum, o_eq, FFOrdered{o0, o2}); + }; SUBCASE("parallel shape inference (LayerNorm)") { SUBCASE("partition parallelism (not in axes)") { - int degree0 = 2; - int degree2 = 3; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 3_n; ParallelTensorShape par_input = make_input( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2, 1); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { tl::expected result = get_output_shape(attrs_affine_true, par_input); - tl::expected correct = make_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2, 1); + tl::expected correct = + make_output(SumDegree{1_n}, + DiscardCopyDegree{1_n}, + degree0, + 1_n, + degree2, + 1_n); CHECK(result == correct); } @@ -179,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_gamma_weights( - SumDegree{1}, DiscardCopyDegree{1}, degree0, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); CHECK(result == correct); } @@ -199,7 +208,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_beta_weights( - SumDegree{1}, DiscardCopyDegree{1}, degree0, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); CHECK(result == correct); } @@ -215,11 +224,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (in axes)") { - int degree1 = 2; - int degree2 = 4; + nonnegative_int degree1 = 2_n; + nonnegative_int degree2 = 4_n; ParallelTensorShape par_input = make_input( - SumDegree{1}, DiscardCopyDegree{1}, 1, degree1, degree2, 1); + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, degree2, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -248,10 +257,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -280,10 +289,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorShape par_input = - 
make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc index 191515b062..eaa99ef099 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_linear_incoming_tensor_roles(LinearAttrs)") { auto make_attrs = [](bool use_bias) { return LinearAttrs{ - /*out_channels=*/16, + /*out_channels=*/16_n, /*use_bias=*/use_bias, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -47,7 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Linear shape inference") { - int out_channels = 16; + nonnegative_int out_channels = 16_n; LinearAttrs attrs = LinearAttrs{ /*out_channels=*/out_channels, /*use_bias=*/true, @@ -56,13 +56,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*regularizer=*/std::nullopt, }; - size_t batch_size = 12; - size_t extra_dim = 16; - size_t in_channels = 8; + nonnegative_int batch_size = 12_n; + nonnegative_int extra_dim = 16_n; + nonnegative_int in_channels = 8_n; TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, in_channels, @@ -73,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -84,9 +84,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape projection = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ in_channels, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(out_channels), + FFOrdered{ + out_channels, }, }, DataType::FLOAT, @@ -127,56 +127,66 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_extra_dim, - int o_channel) { + nonnegative_int o_batch, + nonnegative_int o_extra_dim, + nonnegative_int o_channel) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); + input, + o_sum, + o_eq, + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_extra_dim, - int o_channel) { + nonnegative_int o_batch, + nonnegative_int o_extra_dim, + nonnegative_int o_channel) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_projection = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_inchannel, - int o_outchannel) { + nonnegative_int o_inchannel, + nonnegative_int o_outchannel) { return lift_to_parallel_with_degrees( - projection, o_sum, o_eq, FFOrdered{o_inchannel, o_outchannel}); + projection, + o_sum, + o_eq, + FFOrdered{o_inchannel, o_outchannel}); }; - auto make_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_outchannel) { - return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannel}); - }; + auto make_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_outchannel) { + return lift_to_parallel_with_degrees( + bias, o_sum, o_eq, FFOrdered{o_outchannel}); + }; 
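// Worked example of what these helpers build (a sketch; it assumes
// lift_to_parallel_with_degrees pairs each dim's full size with the given
// shard degree and attaches the replica degrees, matching its use in the
// other op-attrs tests): in the "data parallelism" subcase below,
// make_input(SumDegree{2_n}, DiscardCopyDegree{1_n}, 4_n, 8_n, 1_n) lifts
// the {12, 16, 8} input into a ParallelTensorShape with shard dims
// {ShardParallelDim{12_n, 4_n}, ShardParallelDim{16_n, 8_n},
// ShardParallelDim{8_n, 1_n}} and replica dims {SumDegree{2_n},
// DiscardCopyDegree{1_n}}.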
SUBCASE("data parallelism") { - int input_sum_degree = 2; - int extra_dim_degree = 8; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int extra_dim_degree = 8_n; + nonnegative_int degree = 4_n; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, - DiscardCopyDegree{1}, + DiscardCopyDegree{1_n}, degree, extra_dim_degree, - 1); + 1_n); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree}, - DiscardCopyDegree{1}, + DiscardCopyDegree{1_n}, degree, extra_dim_degree, - 1); + 1_n); CHECK(result == correct); } @@ -185,10 +195,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_projection_shape(attrs, par_input); tl::expected correct = make_projection( - SumDegree{1}, + SumDegree{1_n}, DiscardCopyDegree{input_sum_degree * degree * extra_dim_degree}, - 1, - 1); + 1_n, + 1_n); CHECK(result == correct); } @@ -198,27 +208,30 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = make_bias(SumDegree{input_sum_degree}, DiscardCopyDegree{degree * extra_dim_degree}, - 1); + 1_n); CHECK(result == correct); } } SUBCASE("reduction parallelism") { - int input_sum_degree = 2; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int degree = 4_n; - ParallelTensorShape par_input = make_input( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, 1, 1, degree); + ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + degree); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree * degree}, - DiscardCopyDegree{1}, - 1, - 1, - 1); + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + 1_n); CHECK(result == correct); } @@ -226,8 +239,10 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection( - SumDegree{1}, DiscardCopyDegree{input_sum_degree}, degree, 1); + make_projection(SumDegree{1_n}, + DiscardCopyDegree{input_sum_degree}, + degree, + 1_n); CHECK(result == correct); } @@ -235,23 +250,30 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); tl::expected correct = make_bias( - SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1}, 1); + SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - int input_sum_degree = 2; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int degree = 4_n; - ParallelTensorShape par_input = make_input( - SumDegree{input_sum_degree}, DiscardCopyDegree{degree}, 1, 1, 1); + ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, + DiscardCopyDegree{degree}, + 1_n, + 1_n, + 1_n); { tl::expected result = get_output_shape(attrs, par_input); - tl::expected correct = make_output( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, 1, 1, degree); + tl::expected correct = + make_output(SumDegree{input_sum_degree}, + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + degree); CHECK(result == correct); } @@ -259,8 +281,10 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection( - SumDegree{1}, DiscardCopyDegree{input_sum_degree}, 1, degree); + make_projection(SumDegree{1_n}, + DiscardCopyDegree{input_sum_degree}, + 1_n, + degree); CHECK(result == correct); } @@ -268,7 +292,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); 
tl::expected correct = make_bias( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, degree); + SumDegree{input_sum_degree}, DiscardCopyDegree{1_n}, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc index 0c14c0fc2a..6c14a226a2 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc @@ -9,27 +9,27 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("make_adaptive_pool2d") { - size_t input_n = 10; - size_t input_c = 11; - size_t input_h = 15; - size_t input_w = 20; + nonnegative_int input_n = 10_n; + nonnegative_int input_c = 11_n; + nonnegative_int input_h = 15_n; + nonnegative_int input_w = 20_n; Activation activation = Activation::RELU; PoolOp op = PoolOp::AVG; - TensorDims input_dims = - TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}; + TensorDims input_dims = TensorDims{ + FFOrdered{input_n, input_c, input_h, input_w}}; SUBCASE("input_h divisible by output_h && input_w divisible by output_w") { - int output_h = 5; - int output_w = 2; + nonnegative_int output_h = 5_n; + nonnegative_int output_w = 2_n; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/10, - /*stride_h=*/3, - /*stride_w=*/10, - /*padding_h=*/0, - /*padding_w=*/0, + /*kernel_h=*/3_n, + /*kernel_w=*/10_n, + /*stride_h=*/3_n, + /*stride_w=*/10_n, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/op, /*activation=*/activation, }; @@ -50,11 +50,11 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(correct_attrs, input_shape); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ input_n, input_c, - size_t_from_int(output_h), - size_t_from_int(output_w), + output_h, + output_w, }}, DataType::FLOAT, }; @@ -64,8 +64,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h not divisible by output_h") { - int output_h = 6; - int output_w = 2; + nonnegative_int output_h = 6_n; + nonnegative_int output_w = 2_n; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -76,8 +76,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_w not divisible by output_w") { - int output_h = 5; - int output_w = 3; + nonnegative_int output_h = 5_n; + nonnegative_int output_w = 3_n; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -88,16 +88,16 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h == output_h and input_w == output_w") { - int output_h = input_h; - int output_w = input_w; + nonnegative_int output_h = input_h; + nonnegative_int output_w = input_w; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/1, - /*kernel_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/0, + /*kernel_h=*/1_n, + /*kernel_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/op, /*activation=*/activation, }; @@ -126,22 +126,22 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(Pool2DAttrs, TensorShape)") { Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, /*activation=*/std::nullopt, }; SUBCASE("fails on non-4d inputs") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, - 14, + TensorDims{FFOrdered{ + 10_n, + 12_n, + 14_n, }}, 
DataType::FLOAT, }; @@ -155,14 +155,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("4d input") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{11, 13, 12, 6}}, + TensorDims{FFOrdered{11_n, 13_n, 12_n, 6_n}}, DataType::FLOAT, }; tl::expected result = get_output_shape(attrs, input); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{11, 13, 6, 4}}, + TensorDims{FFOrdered{11_n, 13_n, 6_n, 4_n}}, DataType::FLOAT, }; @@ -175,12 +175,12 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_attrs = [](PoolOp pool_type, std::optional const &activation) { return Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/pool_type, /*activation=*/activation, }; @@ -190,13 +190,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 4, - 1, - 1, - 1, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 4_n, + 1_n, + 1_n, + 1_n, }, }; @@ -211,13 +211,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 4, - 2, - 5, - 6, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 4_n, + 2_n, + 5_n, + 6_n, }, }; @@ -232,13 +232,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{3}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{1_n}, + DiscardCopyDegree{3_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -256,13 +256,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -279,13 +279,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -302,13 +302,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/Activation::RELU); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -326,12 +326,12 @@ TEST_SUITE(FF_TEST_SUITE) { // just do a single test to make sure it works/exists Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, /*activation=*/std::nullopt, }; @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 7}, - ShardParallelDim{16, 8}, - ShardParallelDim{12, 3}, - ShardParallelDim{6, 2}, + 
ShardParallelDim{14_n, 7_n}, + ShardParallelDim{16_n, 8_n}, + ShardParallelDim{12_n, 3_n}, + ShardParallelDim{6_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{2}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -359,14 +359,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 7}, - ShardParallelDim{16, 8}, - ShardParallelDim{6, 3}, - ShardParallelDim{4, 2}, + ShardParallelDim{14_n, 7_n}, + ShardParallelDim{16_n, 8_n}, + ShardParallelDim{6_n, 3_n}, + ShardParallelDim{4_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{2}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -377,14 +377,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 1}, - ShardParallelDim{16, 1}, - ShardParallelDim{12, 1}, - ShardParallelDim{6, 1}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 1_n}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{6_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{2}, - DiscardCopyDegree{1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc index 0d1c8bdf98..dc12eb12a8 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc @@ -10,21 +10,21 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, }; SUBCASE("valid") { - int degree = 3; + nonnegative_int degree = 3_n; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; @@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - int degree = 4; + nonnegative_int degree = 4_n; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; diff --git a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc index ba213f54f4..36a265ce9f 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc @@ -6,8 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Repartition shape inference") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 4; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 4_n; RepartitionAttrs attrs = RepartitionAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -16,14 +16,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc index 60a1018479..770ae20d38 100644 --- 
a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Replicate shape inference") { ReplicateAttrs attrs = ReplicateAttrs{ - /*replicate_degree=*/4, + /*replicate_degree=*/4_n, }; ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, - ShardParallelDim{14, 2}, - ShardParallelDim{16, 2}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{14_n, 2_n}, + ShardParallelDim{16_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -28,7 +28,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape result = get_output_shape(attrs, input); ParallelTensorShape correct_output = input; - correct_output.dims.replica_dims.discard_copy_degree = DiscardCopyDegree{8}; + correct_output.dims.replica_dims.discard_copy_degree = + DiscardCopyDegree{8_n}; CHECK(result == correct_output); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc index 5808e5ef42..8c80e348c0 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc @@ -10,16 +10,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, TensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; SUBCASE("attrs.dim in bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; tl::expected result = get_output_shape(attrs, input); @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("attrs.dims out of bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{4}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{4_n}}; std::optional result = optional_from_expected(get_output_shape(attrs, input)); @@ -41,47 +41,53 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, ParallelTensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; TensorShape output = input; - auto make_input = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto make_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("partition parallelism in non-softmax-dim (valid)") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + ParallelTensorShape par_input = 
make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); SUBCASE("attrs.dim in bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("attrs.dims out of bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{4}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{4_n}}; std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -92,12 +98,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallism in softmax dim (invalid)") { - int degree1 = 2; + nonnegative_int degree1 = 2_n; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree1, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -107,12 +113,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism (invalid)") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -122,12 +128,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (invalid)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc index 73f5f0674d..1187bfcfbf 100644 --- a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc +++ b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc @@ -6,8 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("PCGOperatorAttrs to/from json") { PCGOperatorAttrs correct = PCGOperatorAttrs{RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{nonnegative_int{1}}, - /*repartition_degree=*/4, + /*repartition_dim=*/ff_dim_t{1_n}, + /*repartition_degree=*/4_n, }}; nlohmann::json j = correct; auto result = j.get(); diff --git a/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc b/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc index c09c1ec3df..e3f3f4534e 100644 --- a/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc +++ b/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc @@ -5,13 +5,13 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ff_dim_t_from_relative_ff_dim_t") { - int input_dim = 5; + nonnegative_int input_dim = 5_n; SUBCASE("relative index is zero") { relative_ff_dim_t relative_ff_dim = 
relative_ff_dim_t{0}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{0}}); + CHECK(ff_dim == ff_dim_t{0_n}); } SUBCASE("relative index is positive") { @@ -20,14 +20,14 @@ TEST_SUITE(FF_TEST_SUITE) { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{1}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{1}}); + CHECK(ff_dim == ff_dim_t{1_n}); } SUBCASE("relative index is out of range") { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{10}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{10}}); + CHECK(ff_dim == ff_dim_t{10_n}); } } @@ -37,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{-1}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{4}}); + CHECK(ff_dim == ff_dim_t{4_n}); } SUBCASE("relative index is out of range") { diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc index 60d87300c1..7e072d82d9 100644 --- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc @@ -7,7 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") { - TensorDims goal = TensorDims{FFOrdered{1, 1, 4, 3}}; + TensorDims goal = + TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; SUBCASE("dims match") { bool result = tensor_dims_is_broadcastable_to(goal, goal); @@ -17,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs num_dims promotion") { - TensorDims curr = TensorDims{FFOrdered{4, 3}}; + TensorDims curr = TensorDims{FFOrdered{4_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -26,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs dim expansion") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 1, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -35,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs both num_dims promotion and dim expansion") { - TensorDims curr = TensorDims{FFOrdered{1, 3}}; + TensorDims curr = TensorDims{FFOrdered{1_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -44,7 +46,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs invalid dim promotion") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 2, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 2_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -53,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_dims(goal) < num_dims(curr)") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 10, 4, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -63,12 +67,13 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_broadcast_target_dims(std::unordered_set)") { - TensorDims d1 = TensorDims{FFOrdered{1, 10, 4, 3}}; + TensorDims d1 = TensorDims{FFOrdered{1_n, 10_n, 4_n, 3_n}}; - TensorDims d2 = TensorDims{FFOrdered{10, 4, 1}}; + TensorDims d2 = TensorDims{FFOrdered{10_n, 4_n, 1_n}}; SUBCASE("has target in inputs") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 4, 
3}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -78,7 +83,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has no possible target") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 1, 4}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -88,10 +94,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has possible target, but not in inputs") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 1, 4, 3}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n, 3_n}}; TensorDims possible_target = - TensorDims{FFOrdered{1, 1, 10, 4, 3}}; + TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; REQUIRE(tensor_dims_is_broadcastable_to(d1, possible_target)); REQUIRE(tensor_dims_is_broadcastable_to(d2, possible_target)); diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index df93f69f2e..290df8574e 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -85,15 +85,15 @@ struct ComputationGraphBuilder { // Add a 2D convolutional layer tensor_guid_t conv2d( tensor_guid_t const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation = std::nullopt, - int groups = 1, + nonnegative_int groups = 1_n, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -107,8 +107,8 @@ struct ComputationGraphBuilder { // Add an embedding layer tensor_guid_t embedding( tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &kernel_initializer = std::nullopt, @@ -121,32 +121,32 @@ struct ComputationGraphBuilder { // Add a cache layer tensor_guid_t cache(tensor_guid_t const &input, - int num_batches, + nonnegative_int num_batches, std::function score_f = {}, std::optional const &name = std::nullopt); // Add a 2D pooling layer tensor_guid_t pool2d(tensor_guid_t const &input, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, PoolOp type = PoolOp::MAX, std::optional const &activation = std::nullopt, std::optional const &name = std::nullopt); tensor_guid_t adaptive_pool2d( tensor_guid_t const &input, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp type = PoolOp::MAX, std::optional const &activation = std::nullopt, std::optional const &name = std::nullopt); tensor_guid_t layer_norm(tensor_guid_t const &input, - std::vector const &axes, + std::vector const &axes, bool elementwise_affine, float eps, std::optional const &name = std::nullopt); @@ -157,15 +157,15 @@ struct ComputationGraphBuilder { float eps, std::optional const &momentum, std::optional const &name = std::nullopt); - tensor_guid_t - batch_matmul(tensor_guid_t const &A, - tensor_guid_t const &B, - int a_seq_length_dim = -1, - int b_seq_length_dim = -1, - 
std::optional const &name = std::nullopt); + tensor_guid_t batch_matmul( + tensor_guid_t const &A, + tensor_guid_t const &B, + std::optional const &a_seq_length_dim = std::nullopt, + std::optional const &b_seq_length_dim = std::nullopt, + std::optional const &name = std::nullopt); tensor_guid_t dense( tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = DataType::FLOAT, @@ -181,7 +181,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); // Add a concat layer tensor_guid_t concat(std::vector const &tensors, - int axis, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); // Add a mean layer tensor_guid_t mean(tensor_guid_t const &input, @@ -191,47 +191,48 @@ struct ComputationGraphBuilder { // Add a split layer std::vector split(tensor_guid_t const &input, - std::vector const &split, - int axis, + std::vector const &split, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); // Add a flat layer - tensor_guid_t flat(tensor_guid_t const &input, - int start_dim = 0, - std::optional const &end_dim = std::nullopt, - std::optional const &name = std::nullopt); + tensor_guid_t + flat(tensor_guid_t const &input, + relative_ff_dim_t start_dim = relative_ff_dim_t{0}, + std::optional const &end_dim = std::nullopt, + std::optional const &name = std::nullopt); // Add a softmax layer tensor_guid_t softmax(tensor_guid_t const &input, - std::optional dim = std::nullopt, + std::optional dim = std::nullopt, std::optional const &name = std::nullopt); // Create input tensors and constants tensor_guid_t transpose(tensor_guid_t const &input, - std::vector const &perm, + std::vector const &perm, std::optional const &name = std::nullopt); tensor_guid_t reduce_sum(tensor_guid_t const &input, - std::vector const &axes, + std::vector const &axes, bool keepdims = false, std::optional const &name = std::nullopt); tensor_guid_t reshape(tensor_guid_t const &input, - std::vector const &shape, + std::vector const &shape, std::optional const &name = std::nullopt); tensor_guid_t reverse(tensor_guid_t const &input, - int axis, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); std::vector top_k(tensor_guid_t const &input, - int k, + nonnegative_int k, bool sorted, std::optional const &name = std::nullopt); tensor_guid_t multihead_attention( tensor_guid_t const &query, tensor_guid_t const &key, tensor_guid_t const &value, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, + nonnegative_int embed_dim, + nonnegative_int num_heads, + nonnegative_int kdim = 0_n, + nonnegative_int vdim = 0_n, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, @@ -254,7 +255,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); std::vector get_outputs(LayerAttrs const &) const; - tensor_guid_t get_output(LayerAttrs const &, int idx) const; + tensor_guid_t get_output(LayerAttrs const &, nonnegative_int idx) const; std::vector add_layer(LayerAttrs const &layer, diff --git a/lib/pcg/include/pcg/cpu_id_t.struct.toml b/lib/pcg/include/pcg/cpu_id_t.struct.toml index 0492a937be..152debbded 100644 --- a/lib/pcg/include/pcg/cpu_id_t.struct.toml +++ b/lib/pcg/include/pcg/cpu_id_t.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "cpu_index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/device_id.h 
b/lib/pcg/include/pcg/device_id.h index 28cf30eaba..36ea9de6b3 100644 --- a/lib/pcg/include/pcg/device_id.h +++ b/lib/pcg/include/pcg/device_id.h @@ -13,9 +13,9 @@ device_id_t operator+(device_id_t, size_t); DeviceType get_device_type(device_id_t const &device_id); gpu_id_t unwrap_gpu(device_id_t); cpu_id_t unwrap_cpu(device_id_t); -int get_raw_id(device_id_t); +nonnegative_int get_raw_id(device_id_t); -device_id_t device_id_from_index(int, DeviceType); +device_id_t device_id_from_index(nonnegative_int, DeviceType); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h index 05c486f0f7..9554995fa0 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h @@ -8,7 +8,7 @@ namespace FlexFlow { V1DataflowGraph to_v1(DataflowGraphView const &); V1DataflowGraph to_v1(DataflowGraphView const &, - std::unordered_map const &); + std::unordered_map const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml index c332b6b41d..57b559a18e 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml @@ -13,6 +13,7 @@ includes = [ "", "", "pcg/file_format/v1/graphs/v1_graph_edge.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -24,7 +25,7 @@ src_includes = [ [[fields]] name = "nodes" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "edges" diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml index 752706fe1d..9150c20056 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml @@ -9,18 +9,22 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "srcNode" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "srcIdx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dstNode" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dstIdx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h index fc9dfcef9a..426bad5a82 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h @@ -13,18 +13,19 @@ namespace FlexFlow { template -std::pair, bidict> +std::pair, + bidict> to_v1_including_node_numbering( LabelledDataflowGraphView const &g) { - bidict nodes = bidict_from_enumerating(get_nodes(g)); + bidict nodes = bidict_from_enumerating(get_nodes(g)); V1DataflowGraph unlabelled = to_v1(g, nodes.reversed()); - std::unordered_map node_labels = map_values( + std::unordered_map node_labels = map_values( nodes.as_unordered_map(), [&](Node const &n) { return g.at(n); }); - std::unordered_map> output_labels = + std::unordered_map> output_labels = map_values(nodes.as_unordered_map(), [&](Node const &n) { return transform(get_outputs(g, n), [&](DataflowOutput const &o) { return g.at(o); 
}); diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml index b440d0f03d..1f69f5cd93 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml @@ -18,6 +18,7 @@ includes = [ "", "pcg/file_format/v1/graphs/v1_dataflow_graph.dtg.h", "pcg/file_format/v1/graphs/v1_graph_output.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -29,11 +30,11 @@ src_includes = [ [[fields]] name = "node_labels" -type = "std::unordered_map" +type = "std::unordered_map<::FlexFlow::nonnegative_int, NodeLabel>" [[fields]] name = "output_labels" -type = "std::unordered_map>" +type = "std::unordered_map<::FlexFlow::nonnegative_int, std::vector>" [[fields]] name = "graph" diff --git a/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml b/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml index 0fe0b1761f..bd60564465 100644 --- a/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml +++ b/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml @@ -9,6 +9,7 @@ features = [ includes = [ "pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_series_split.dtg.h", "pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_parallel_split.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[values]] @@ -20,5 +21,5 @@ type = "::FlexFlow::V1BinaryParallelSplit" key = "parallel" [[values]] -type = "int" +type = "::FlexFlow::nonnegative_int" key = "leaf" diff --git a/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h b/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h index 5590d6999b..c0e9966425 100644 --- a/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { V1ComputationGraph to_v1(ComputationGraph const &); -std::pair> +std::pair> to_v1_including_node_numbering(ComputationGraph const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/gpu_id_t.struct.toml b/lib/pcg/include/pcg/gpu_id_t.struct.toml index 170dbb96fa..7a85b4c0a7 100644 --- a/lib/pcg/include/pcg/gpu_id_t.struct.toml +++ b/lib/pcg/include/pcg/gpu_id_t.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "gpu_index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/machine_space_coordinate.struct.toml b/lib/pcg/include/pcg/machine_space_coordinate.struct.toml index 9b197a74c9..2528eab849 100644 --- a/lib/pcg/include/pcg/machine_space_coordinate.struct.toml +++ b/lib/pcg/include/pcg/machine_space_coordinate.struct.toml @@ -11,15 +11,16 @@ features = [ includes = [ "pcg/device_type.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "node_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "device_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "device_type" diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 39591e8a70..11c5a81bba 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ 
-8,12 +8,12 @@ namespace FlexFlow { -int get_num_gpus(MachineSpecification const &ms); -int get_num_cpus(MachineSpecification const &ms); -int get_num_devices(MachineSpecification const &ms, - DeviceType const &device_type); -int get_num_devices_per_node(MachineSpecification const &ms, - DeviceType const &device_type); +nonnegative_int get_num_gpus(MachineSpecification const &ms); +nonnegative_int get_num_cpus(MachineSpecification const &ms); +nonnegative_int get_num_devices(MachineSpecification const &ms, + DeviceType const &device_type); +nonnegative_int get_num_devices_per_node(MachineSpecification const &ms, + DeviceType const &device_type); bool is_valid_machine_space_coordinate(MachineSpecification const &ms, MachineSpaceCoordinate const &coord); diff --git a/lib/pcg/include/pcg/machine_specification.struct.toml b/lib/pcg/include/pcg/machine_specification.struct.toml index e75b5018cb..7c624c7240 100644 --- a/lib/pcg/include/pcg/machine_specification.struct.toml +++ b/lib/pcg/include/pcg/machine_specification.struct.toml @@ -9,17 +9,21 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_nodes" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_cpus_per_node" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_gpus_per_node" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "inter_node_bandwidth" diff --git a/lib/pcg/include/pcg/machine_view.h b/lib/pcg/include/pcg/machine_view.h index f72b2359dc..6ed9e7dd9c 100644 --- a/lib/pcg/include/pcg/machine_view.h +++ b/lib/pcg/include/pcg/machine_view.h @@ -5,7 +5,7 @@ #include "machine_view.dtg.h" #include "pcg/device_id_t.dtg.h" #include "pcg/operator_task_space.dtg.h" -#include "task_space_coordinate.dtg.h" +#include "pcg/task_space_coordinate.dtg.h" #include #include #include diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h index 1a19397c72..b095fad088 100644 --- a/lib/pcg/include/pcg/operator_task_space.h +++ b/lib/pcg/include/pcg/operator_task_space.h @@ -16,8 +16,8 @@ std::unordered_set TaskSpaceCoordinate get_task_space_maximum_coordinate(OperatorTaskSpace const &task); -size_t num_dims(OperatorTaskSpace const &task); -size_t num_tasks(OperatorTaskSpace const &task); +nonnegative_int num_dims(OperatorTaskSpace const &task); +nonnegative_int num_tasks(OperatorTaskSpace const &task); OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer); diff --git a/lib/pcg/include/pcg/operator_task_space.struct.toml b/lib/pcg/include/pcg/operator_task_space.struct.toml index 3ab8b83173..9cc4f6b93a 100644 --- a/lib/pcg/include/pcg/operator_task_space.struct.toml +++ b/lib/pcg/include/pcg/operator_task_space.struct.toml @@ -11,6 +11,7 @@ features = [ includes = [ "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -20,4 +21,4 @@ src_includes = [ [[fields]] name = "degrees" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index 019b120936..faa9b73d95 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -33,15 +33,15 @@ struct ParallelComputationGraphBuilder { 
parallel_tensor_guid_t conv2d( parallel_tensor_guid_t const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation = std::nullopt, - int groups = 1, + nonnegative_int groups = 1_n, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -50,7 +50,7 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t dense( parallel_tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = DataType::FLOAT, @@ -61,8 +61,8 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t embedding( parallel_tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &kernel_initializer = std::nullopt, @@ -72,10 +72,10 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t const &query, parallel_tensor_guid_t const &key, parallel_tensor_guid_t const &value, - int embed_dim, - int num_heads, - std::optional kdim = std::nullopt, - std::optional vdim = std::nullopt, + nonnegative_int embed_dim, + nonnegative_int num_heads, + std::optional kdim = std::nullopt, + std::optional vdim = std::nullopt, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, @@ -120,20 +120,20 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t parallel_partition(parallel_tensor_guid_t const &x, ff_dim_t dim, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_combine(parallel_tensor_guid_t const &x, ff_dim_t dim, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_replicate(parallel_tensor_guid_t const &x, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_reduce(parallel_tensor_guid_t const &x, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); private: diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h index 7aac8558e4..5bce560020 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h @@ -11,7 +11,7 @@ parallel_tensor_guid_t get_parallel_tensor(ParallelComputationGraphEdge const &); parallel_layer_guid_t get_src_layer(ParallelComputationGraphEdge const &); parallel_layer_guid_t get_dst_layer(ParallelComputationGraphEdge const &); -int get_dst_layer_input_idx(ParallelComputationGraphEdge const &); +nonnegative_int get_dst_layer_input_idx(ParallelComputationGraphEdge const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/start_invariant_machine_view.h b/lib/pcg/include/pcg/start_invariant_machine_view.h index f5091c69d1..cdf17213f9 100644 --- a/lib/pcg/include/pcg/start_invariant_machine_view.h +++ b/lib/pcg/include/pcg/start_invariant_machine_view.h @@ -17,7 +17,7 @@ MachineView StartInvariantMachineView 
     start_invariant_from_machine_view(MachineView const &mv);
 
-size_t num_dims(StartInvariantMachineView const &mv);
+nonnegative_int num_dims(StartInvariantMachineView const &mv);
 
 DeviceType get_device_type(StartInvariantMachineView const &mv);
 
diff --git a/lib/pcg/include/pcg/stride_t.struct.toml b/lib/pcg/include/pcg/stride_t.struct.toml
index a764497b8b..8d950c5f39 100644
--- a/lib/pcg/include/pcg/stride_t.struct.toml
+++ b/lib/pcg/include/pcg/stride_t.struct.toml
@@ -9,6 +9,10 @@ features = [
   "fmt",
 ]
 
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h",
+]
+
 [[fields]]
 name = "unwrapped"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
diff --git a/lib/pcg/include/pcg/task_space_coordinate.struct.toml b/lib/pcg/include/pcg/task_space_coordinate.struct.toml
index 65aea167cb..1057676b8e 100644
--- a/lib/pcg/include/pcg/task_space_coordinate.struct.toml
+++ b/lib/pcg/include/pcg/task_space_coordinate.struct.toml
@@ -11,6 +11,7 @@ features = [
 
 includes = [
   "<vector>",
+  "utils/nonnegative_int/nonnegative_int.h",
 ]
 
 src_includes = [
@@ -20,4 +21,4 @@ src_includes = [
 
 [[fields]]
 name = "raw_coord"
-type = "std::vector<int>"
+type = "std::vector<::FlexFlow::nonnegative_int>"
diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc
index 09772fa9d9..0d07c43a91 100644
--- a/lib/pcg/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/computation_graph_builder.cc
@@ -376,30 +376,32 @@ tensor_guid_t
 tensor_guid_t ComputationGraphBuilder::conv2d(
     tensor_guid_t const &x,
-    int outChannels,
-    int kernelH,
-    int kernelW,
-    int strideH,
-    int strideW,
-    int paddingH,
-    int paddingW,
+    nonnegative_int outChannels,
+    nonnegative_int kernelH,
+    nonnegative_int kernelW,
+    nonnegative_int strideH,
+    nonnegative_int strideW,
+    nonnegative_int paddingH,
+    nonnegative_int paddingW,
     std::optional<Activation> const &activation,
-    int groups,
+    nonnegative_int groups,
     bool use_bias,
     std::optional<InitializerAttrs> const &kernel_initializer,
     std::optional<InitializerAttrs> const &bias_initializer,
    std::optional<RegularizerAttrs> const &kernel_regularizer,
    std::optional<std::string> const &maybe_name) {
-  Conv2DAttrs attrs = Conv2DAttrs{outChannels,
-                                  kernelH,
-                                  kernelW,
-                                  strideH,
-                                  strideW,
-                                  paddingH,
-                                  paddingW,
-                                  groups,
-                                  activation,
-                                  use_bias};
+  Conv2DAttrs attrs = Conv2DAttrs{
+      /*out_channels=*/outChannels,
+      /*kernel_h=*/kernelH,
+      /*kernel_w=*/kernelW,
+      /*stride_h=*/strideH,
+      /*stride_w=*/strideW,
+      /*padding_h=*/paddingH,
+      /*padding_w=*/paddingW,
+      /*groups=*/groups,
+      /*activation=*/activation,
+      /*use_bias=*/use_bias,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -451,13 +453,18 @@ tensor_guid_t ComputationGraphBuilder::dropout(
 
 tensor_guid_t ComputationGraphBuilder::embedding(
     tensor_guid_t const &x,
-    int num_entries,
-    int outDim,
+    nonnegative_int num_entries,
+    nonnegative_int outDim,
     AggregateOp aggr,
     DataType dtype,
     std::optional<InitializerAttrs> const &kernel_initializer,
    std::optional<std::string> const &maybe_name) {
-  EmbeddingAttrs attrs = EmbeddingAttrs{num_entries, outDim, aggr, dtype};
+  EmbeddingAttrs attrs = EmbeddingAttrs{
+      /*num_entries=*/num_entries,
+      /*out_channels=*/outDim,
+      /*aggr=*/aggr,
+      /*data_type=*/dtype,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -509,12 +516,12 @@ tensor_guid_t ComputationGraphBuilder::gather(
 }
 
 tensor_guid_t ComputationGraphBuilder::pool2d(
     tensor_guid_t const &x,
-    int kernelH,
-    int kernelW,
-    int strideH,
-    int strideW,
-    int paddingH,
-    int paddingW,
+    nonnegative_int kernelH,
+    nonnegative_int kernelW,
+    nonnegative_int strideH,
+    nonnegative_int strideW,
+    nonnegative_int paddingH,
+    nonnegative_int paddingW,
     PoolOp type,
     std::optional<Activation> const &activation,
     std::optional<std::string> const &maybe_name) {
@@ -547,8 +554,8 @@ tensor_guid_t ComputationGraphBuilder::pool2d(
 
 tensor_guid_t ComputationGraphBuilder::adaptive_pool2d(
     tensor_guid_t const &uncasted_input,
-    int output_h,
-    int output_w,
+    nonnegative_int output_h,
+    nonnegative_int output_w,
     PoolOp type,
     std::optional<Activation> const &activation,
     std::optional<std::string> const &maybe_name) {
@@ -637,10 +644,10 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention(
     tensor_guid_t const &query,
     tensor_guid_t const &key,
     tensor_guid_t const &value,
-    int embed_dim,
-    int num_heads,
-    int kdim,
-    int vdim,
+    nonnegative_int embed_dim,
+    nonnegative_int num_heads,
+    nonnegative_int kdim,
+    nonnegative_int vdim,
     float dropout,
     bool bias,
     bool add_bias_kv,
@@ -662,14 +669,16 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention(
         "If you need this functionality, please create an issue.");
   }
 
-  MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{embed_dim,
-                                                          num_heads,
-                                                          kdim,
-                                                          vdim,
-                                                          dropout,
-                                                          bias,
-                                                          add_bias_kv,
-                                                          add_zero_attn};
+  MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{
+      /*embed_dim=*/embed_dim,
+      /*num_heads=*/num_heads,
+      /*kdim=*/kdim,
+      /*vdim=*/vdim,
+      /*dropout=*/dropout,
+      /*bias=*/bias,
+      /*add_bias_kv=*/add_bias_kv,
+      /*add_zero_attn=*/add_zero_attn,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -743,7 +752,7 @@ TensorDims ComputationGraphBuilder::get_broadcast_target_dims(
 
 tensor_guid_t ComputationGraphBuilder::dense(
     tensor_guid_t const &input,
-    int outDim,
+    nonnegative_int outDim,
     std::optional<Activation> activation,
     bool use_bias,
     DataType data_type,
@@ -752,8 +761,13 @@ tensor_guid_t ComputationGraphBuilder::dense(
     std::optional<std::string> const &maybe_name,
     std::optional<std::string> const &projection_name,
     std::optional<std::string> const &bias_name) {
-  LinearAttrs attrs =
-      LinearAttrs{outDim, use_bias, data_type, activation, std::nullopt};
+  LinearAttrs attrs = LinearAttrs{
+      /*out_channels=*/outDim,
+      /*use_bias=*/use_bias,
+      /*data_type=*/data_type,
+      /*activation=*/activation,
+      /*regularizer=*/std::nullopt,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -794,12 +808,11 @@ tensor_guid_t ComputationGraphBuilder::dense(
 
 tensor_guid_t ComputationGraphBuilder::concat(
     std::vector<tensor_guid_t> const &inputs,
-    int axis,
+    relative_ff_dim_t axis,
     std::optional<std::string> const &maybe_name) {
-  relative_ff_dim_t wrapped_axis = relative_ff_dim_t{axis};
 
   ConcatAttrs attrs = ConcatAttrs{ff_dim_t_from_relative_ff_dim_t(
-      wrapped_axis, num_dims(this->get_shape(inputs[0])))};
+      axis, num_dims(this->get_shape(inputs[0])))};
 
   std::string name =
       maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs}));
@@ -817,17 +830,17 @@ tensor_guid_t ComputationGraphBuilder::concat(
 
 tensor_guid_t ComputationGraphBuilder::flat(
     tensor_guid_t const &input,
-    int start_dim,
-    std::optional<int> const &end_dim,
+    relative_ff_dim_t start_dim,
+    std::optional<relative_ff_dim_t> const &end_dim,
     std::optional<std::string> const &maybe_name) {
-  int input_num_dims = num_dims(this->get_shape(input));
+  nonnegative_int input_num_dims = num_dims(this->get_shape(input));
 
   FlatAttrs attrs = FlatAttrs{
-      /*start_dim=*/ff_dim_t_from_relative_ff_dim_t(
-          relative_ff_dim_t{start_dim}, input_num_dims),
+      /*start_dim=*/ff_dim_t_from_relative_ff_dim_t(start_dim, input_num_dims),
       /*end_dim=*/
-      ff_dim_t_from_relative_ff_dim_t(
-          relative_ff_dim_t{end_dim.value_or(input_num_dims)}, input_num_dims),
+      ff_dim_t_from_relative_ff_dim_t(end_dim.value_or(relative_ff_dim_t{
+                                          input_num_dims.unwrap_nonnegative()}),
+                                      input_num_dims),
   };
 
   std::string name =
@@ -843,16 +856,15 @@ tensor_guid_t ComputationGraphBuilder::flat(
 
 tensor_guid_t ComputationGraphBuilder::layer_norm(
     tensor_guid_t const &input,
-    std::vector<int> const &relative_axes,
+    std::vector<relative_ff_dim_t> const &relative_axes,
     bool elementwise_affine,
     float eps,
     std::optional<std::string> const &maybe_name) {
 
   TensorShape input_shape = this->get_shape(input);
 
-  auto resolve_dim_idx = [&](int dim_idx) {
-    return ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t{dim_idx},
-                                           num_dims(input_shape));
+  auto resolve_dim_idx = [&](relative_ff_dim_t dim_idx) {
+    return ff_dim_t_from_relative_ff_dim_t(dim_idx, num_dims(input_shape));
   };
 
   stack_vector<ff_dim_t, MAX_TENSOR_DIM> axes = stack_vector_of<MAX_TENSOR_DIM>(
@@ -910,15 +922,16 @@ tensor_guid_t ComputationGraphBuilder::layer_norm(
 
 tensor_guid_t ComputationGraphBuilder::softmax(
     tensor_guid_t const &input,
-    std::optional<int> maybe_dim,
+    std::optional<relative_ff_dim_t> maybe_dim,
     std::optional<std::string> const &maybe_name) {
 
   TensorShape input_shape = this->get_shape(input);
 
-  int dim = maybe_dim.value_or(num_dims(input_shape) - 1);
+  relative_ff_dim_t dim = maybe_dim.value_or(
+      relative_ff_dim_t{num_dims(input_shape).unwrap_nonnegative() - 1});
 
-  SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t(
-      relative_ff_dim_t{dim}, num_dims(input_shape))};
+  SoftmaxAttrs attrs =
+      SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t(dim, num_dims(input_shape))};
 
   if (attrs.dim.value >= num_dims(input_shape)) {
     throw mk_runtime_error(
diff --git a/lib/pcg/src/pcg/device_id.cc b/lib/pcg/src/pcg/device_id.cc
index a8cfe1f82f..1a4f7b7d22 100644
--- a/lib/pcg/src/pcg/device_id.cc
+++ b/lib/pcg/src/pcg/device_id.cc
@@ -25,7 +25,7 @@ cpu_id_t unwrap_cpu(device_id_t device_id) {
   return device_id.get<cpu_id_t>();
 }
 
-int get_raw_id(device_id_t device_id) {
+nonnegative_int get_raw_id(device_id_t device_id) {
   switch (get_device_type(device_id)) {
     case DeviceType::GPU:
       return unwrap_gpu(device_id).gpu_index;
@@ -36,7 +36,7 @@ int get_raw_id(device_id_t device_id) {
   }
 }
 
-device_id_t device_id_from_index(int idx, DeviceType device_type) {
+device_id_t device_id_from_index(nonnegative_int idx, DeviceType device_type) {
   switch (device_type) {
     case DeviceType::GPU:
       return device_id_t{gpu_id_t{idx}};
diff --git a/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc b/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc
index cf150a339f..064e2d81d3 100644
--- a/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc
+++ b/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc
@@ -10,15 +10,15 @@ namespace FlexFlow {
 
 V1DataflowGraph to_v1(DataflowGraphView const &g) {
-  bidict<size_t, Node> node_enumeration_bidict =
+  bidict<nonnegative_int, Node> node_enumeration_bidict =
       bidict_from_enumerating(get_nodes(g));
-  std::unordered_map<Node, size_t> node_enumeration =
+  std::unordered_map<Node, nonnegative_int> node_enumeration =
       node_enumeration_bidict.reversed().as_unordered_map();
   return to_v1(g, node_enumeration);
 }
 
 V1DataflowGraph to_v1(DataflowGraphView const &g,
-                      std::unordered_map<Node, size_t> const &nodes) {
+                      std::unordered_map<Node, nonnegative_int> const &nodes) {
   std::unordered_set<V1GraphEdge> edges;
   for (DataflowEdge const &e : get_edges(g)) {
     edges.insert(V1GraphEdge{
diff --git a/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc b/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc
index d353ccdda3..ac819db342 100644
--- a/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc
+++ b/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc
@@ -1 +1,17 @@
 #include "pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using NodeLabel = value_type<0>;
+using OutputLabel = value_type<1>;
+
+template std::pair<V1LabelledDataflowGraph<NodeLabel, OutputLabel>,
+                   bidict<nonnegative_int, Node>>
+    to_v1_including_node_numbering(
+        LabelledDataflowGraphView<NodeLabel, OutputLabel> const &);
+
+template V1LabelledDataflowGraph<NodeLabel, OutputLabel>
+    to_v1(LabelledDataflowGraphView<NodeLabel, OutputLabel> const &);
+
+} // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
index 5341e03c0a..d39652a7e2 100644
--- a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
+++ b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
@@ -21,7 +21,7 @@ V1BinarySPDecomposition
     };
   } else if (type == "leaf") {
     return V1BinarySPDecomposition{
-        j.at("value").get<int>(),
+        j.at("value").get<nonnegative_int>(),
     };
   } else {
     throw mk_runtime_error(fmt::format(
@@ -45,7 +45,7 @@ void adl_serializer<V1BinarySPDecomposition>::to_json(
         j["type"] = "parallel";
         return std::monostate{};
       },
-      [&](int leaf) {
+      [&](nonnegative_int leaf) {
        j["value"] = leaf;
        j["type"] = "leaf";
        return std::monostate{};
diff --git a/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc b/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc
index 975e92dfb7..3511ccc269 100644
--- a/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc
+++ b/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc
@@ -9,13 +9,14 @@ V1ComputationGraph to_v1(ComputationGraph const &g) {
   };
 }
 
-std::pair<V1ComputationGraph, bidict<size_t, layer_guid_t>>
+std::pair<V1ComputationGraph, bidict<nonnegative_int, layer_guid_t>>
     to_v1_including_node_numbering(ComputationGraph const &cg) {
-  std::pair<V1LabelledDataflowGraph<LayerAttrs, TensorAttrs>, bidict<size_t, Node>>
+  std::pair<V1LabelledDataflowGraph<LayerAttrs, TensorAttrs>,
+            bidict<nonnegative_int, Node>>
       raw = to_v1_including_node_numbering(cg.raw_graph);
   V1ComputationGraph v1_cg = V1ComputationGraph{raw.first};
-  bidict<size_t, layer_guid_t> v1_node_ids =
+  bidict<nonnegative_int, layer_guid_t> v1_node_ids =
       map_values(raw.second, [](Node const &n) { return layer_guid_t{n}; });
 
   return {v1_cg, v1_node_ids};
diff --git a/lib/pcg/src/pcg/machine_space_offset.cc b/lib/pcg/src/pcg/machine_space_offset.cc
index 9990023f8c..4aa79b3d1b 100644
--- a/lib/pcg/src/pcg/machine_space_offset.cc
+++ b/lib/pcg/src/pcg/machine_space_offset.cc
@@ -17,8 +17,10 @@ MachineSpaceOffset get_machine_space_offset_from_coordinate(
         fmt::format("{} has different DeviceType from {}", start, coord));
   }
 
-  return MachineSpaceOffset{coord.node_idx - start.node_idx,
-                            coord.device_idx - start.device_idx,
+  return MachineSpaceOffset{coord.node_idx.unwrap_nonnegative() -
+                                start.node_idx.unwrap_nonnegative(),
+                            coord.device_idx.unwrap_nonnegative() -
+                                start.device_idx.unwrap_nonnegative(),
                             coord.device_type};
 }
 
diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc
index 19ff50b4b7..0fefeddd27 100644
--- a/lib/pcg/src/pcg/machine_specification.cc
+++ b/lib/pcg/src/pcg/machine_specification.cc
@@ -4,14 +4,16 @@
 #include "utils/exception.h"
 
 namespace FlexFlow {
-int get_num_gpus(MachineSpecification const &ms) {
+nonnegative_int get_num_gpus(MachineSpecification const &ms) {
   return ms.num_nodes * ms.num_gpus_per_node;
 }
-int get_num_cpus(MachineSpecification const &ms) {
+
+nonnegative_int get_num_cpus(MachineSpecification const &ms) {
   return ms.num_nodes * ms.num_cpus_per_node;
 }
-int get_num_devices(MachineSpecification const &ms,
-                    DeviceType const &device_type) {
+
+nonnegative_int get_num_devices(MachineSpecification const &ms,
+                                DeviceType const &device_type) {
   switch (device_type) {
     case DeviceType::GPU:
       return get_num_gpus(ms);
@@
-22,8 +24,8 @@ int get_num_devices(MachineSpecification const &ms,
   }
 }
 
-int get_num_devices_per_node(MachineSpecification const &ms,
-                             DeviceType const &device_type) {
+nonnegative_int get_num_devices_per_node(MachineSpecification const &ms,
+                                         DeviceType const &device_type) {
   switch (device_type) {
     case DeviceType::GPU:
       return ms.num_gpus_per_node;
@@ -33,6 +35,7 @@ int get_num_devices_per_node(MachineSpecification const &ms,
     throw mk_runtime_error(fmt::format("Unknown DeviceType {}", device_type));
   }
 }
+
 bool is_valid_machine_space_coordinate(MachineSpecification const &ms,
                                        MachineSpaceCoordinate const &coord) {
   return (coord.node_idx < ms.num_nodes) &&
@@ -45,7 +48,7 @@ device_id_t get_device_id(MachineSpecification const &ms,
     throw mk_runtime_error(fmt::format(
        "Invalid coordinate {} for machine specification {}", ms, coord));
   }
-  int raw_idx =
+  nonnegative_int raw_idx =
       coord.node_idx * get_num_devices_per_node(ms, coord.device_type) +
       coord.device_idx;
   return device_id_from_index(raw_idx, coord.device_type);
diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc
index cc42ad83b2..fe319dc63c 100644
--- a/lib/pcg/src/pcg/machine_view.cc
+++ b/lib/pcg/src/pcg/machine_view.cc
@@ -16,6 +16,9 @@
 #include "utils/containers/transform.h"
 #include "utils/containers/zip.h"
 #include "utils/exception.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/nonnegative_int/num_elements.h"
+
 namespace FlexFlow {
 
 size_t num_dims(MachineView const &mv) {
@@ -71,47 +74,57 @@ std::optional<MachineSpaceCoordinate> get_machine_space_coordinate(
   }
 
   auto get_dimension_indices_for_dimension =
-      [&](MachineSpecificationDimension dimension) {
-        std::vector<MachineSpecificationDimension> mv_dimensions =
-            get_dimensions(machine_view);
-        return filter(count(mv_dimensions.size()), [&](size_t idx) {
-          return mv_dimensions.at(idx) == dimension;
-        });
-      };
-
-  auto compute_index = [&](int start_idx,
-                           std::vector<size_t> const &dimension_indices) {
-    std::vector<stride_t> mv_strides = get_strides(machine_view);
-
-    std::vector<int> sizes = transform(dimension_indices, [&](size_t i) {
-      return task.degrees.at(i) * mv_strides.at(i).unwrapped;
-    });
-    std::vector<int> coord_points = transform(
-        dimension_indices, [&](size_t i) { return coord.raw_coord.at(i); });
-    std::vector<int> strides = transform(dimension_indices, [&](size_t i) {
-      return mv_strides.at(i).unwrapped;
-    });
-
-    std::vector<int> coeffs = scanl(sizes, 1, std::multiplies<int>());
-
-    int index = start_idx;
-    for (auto [coeff, coord_point, stride] :
-         zip(coeffs, coord_points, strides)) {
-      index += coeff * coord_point * stride;
-    }
-    return index;
+      [&](MachineSpecificationDimension dimension)
+      -> std::vector<nonnegative_int> {
+    std::vector<MachineSpecificationDimension> mv_dimensions =
+        get_dimensions(machine_view);
+    return filter(nonnegative_range(num_elements(mv_dimensions)),
+                  [&](nonnegative_int idx) {
+                    return mv_dimensions.at(idx.unwrap_nonnegative()) ==
+                           dimension;
+                  });
   };
 
-  std::vector<size_t> inter_dimension_indices =
+  auto compute_index =
+      [&](nonnegative_int start_idx,
+          std::vector<nonnegative_int> const &dimension_indices) {
+        std::vector<stride_t> mv_strides = get_strides(machine_view);
+
+        std::vector<nonnegative_int> sizes =
+            transform(dimension_indices, [&](nonnegative_int i) {
+              return task.degrees.at(i.unwrap_nonnegative()) *
+                     mv_strides.at(i.unwrap_nonnegative()).unwrapped;
+            });
+        std::vector<nonnegative_int> coord_points =
+            transform(dimension_indices, [&](nonnegative_int i) {
+              return coord.raw_coord.at(i.unwrap_nonnegative());
+            });
+        std::vector<nonnegative_int> strides =
+            transform(dimension_indices, [&](nonnegative_int i) {
+              return mv_strides.at(i.unwrap_nonnegative()).unwrapped;
+            });
+
+        std::vector<nonnegative_int> coeffs = scanl(
+            sizes, nonnegative_int{1}, std::multiplies<nonnegative_int>());
+
+        nonnegative_int index = start_idx;
+        for (auto [coeff, coord_point, stride] :
+             zip(coeffs, coord_points, strides)) {
+          index += coeff * coord_point * stride;
+        }
+        return index;
+      };
+
+  std::vector<nonnegative_int> inter_dimension_indices =
       get_dimension_indices_for_dimension(
           MachineSpecificationDimension::INTER_NODE);
-  std::vector<size_t> intra_dimension_indices =
+  std::vector<nonnegative_int> intra_dimension_indices =
       get_dimension_indices_for_dimension(
           MachineSpecificationDimension::INTRA_NODE);
 
-  int node_idx =
+  nonnegative_int node_idx =
       compute_index(machine_view.start.node_idx, inter_dimension_indices);
-  int device_idx =
+  nonnegative_int device_idx =
       compute_index(machine_view.start.device_idx, intra_dimension_indices);
   MachineSpaceCoordinate ms_coord = MachineSpaceCoordinate{
       node_idx, device_idx, get_device_type(machine_view)};
diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc
index 7157b75082..57af6eedc7 100644
--- a/lib/pcg/src/pcg/operator_task_space.cc
+++ b/lib/pcg/src/pcg/operator_task_space.cc
@@ -14,18 +14,23 @@
 #include "utils/containers/unordered_set_of.h"
 #include "utils/containers/vector_of.h"
 #include "utils/fmt/unordered_set.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/nonnegative_int/num_elements.h"
+
 namespace FlexFlow {
 
 std::unordered_set<TaskSpaceCoordinate>
     get_task_space_coordinates(OperatorTaskSpace const &task) {
-  std::vector<std::vector<int>> coordinate_ranges = transform(
-      task.degrees, [&](int const &num_points) { return range(num_points); });
+  std::vector<std::vector<nonnegative_int>> coordinate_ranges =
+      transform(task.degrees, [&](nonnegative_int num_points) {
+        return nonnegative_range(num_points);
+      });
 
-  std::unordered_set<std::vector<int>> raw_coordinates =
+  std::unordered_set<std::vector<nonnegative_int>> raw_coordinates =
       unordered_set_of(cartesian_product(coordinate_ranges));
   std::unordered_set<TaskSpaceCoordinate> task_space_coordinates =
-      transform(raw_coordinates, [](std::vector<int> const &point) {
+      transform(raw_coordinates, [](std::vector<nonnegative_int> const &point) {
        return TaskSpaceCoordinate{point};
       });
   return task_space_coordinates;
@@ -36,10 +41,11 @@ TaskSpaceCoordinate
   return maximum(get_task_space_coordinates(task));
 }
 
-size_t num_dims(OperatorTaskSpace const &task) {
-  return task.degrees.size();
+nonnegative_int num_dims(OperatorTaskSpace const &task) {
+  return num_elements(task.degrees);
 }
-size_t num_tasks(OperatorTaskSpace const &task) {
+
+nonnegative_int num_tasks(OperatorTaskSpace const &task) {
   return product(task.degrees);
 }
 
@@ -48,7 +54,7 @@ OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
   parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0);
   ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor);
 
-  std::vector<int> degrees;
+  std::vector<nonnegative_int> degrees;
   extend(degrees, vector_of(ff_ordered_shard_degrees(shape)));
   degrees.push_back(get_sum_degree(shape));
   degrees.push_back(get_discard_copy_degree(shape));
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
index dadad6277f..2cf149f78a 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
@@ -9,7 +9,7 @@ std::unordered_set<ParallelOpAttrs>
     ParallelTensorShape const &goal) {
   std::unordered_set<ParallelOpAttrs> result;
 
-  int sum_degree = get_sum_degree(goal);
+  nonnegative_int sum_degree = get_sum_degree(goal);
   if (sum_degree != 1) {
     throw mk_runtime_error(
fmt::format("generate_weight_transform currently only supports " @@ -17,7 +17,7 @@ std::unordered_set sum_degree)); } - int discard_copy_degree = get_discard_copy_degree(goal); + nonnegative_int discard_copy_degree = get_discard_copy_degree(goal); if (discard_copy_degree != 1) { result.insert(ParallelOpAttrs{ReplicateAttrs{discard_copy_degree}}); } diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index e2f4555328..16896347e0 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -108,8 +108,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_matmul( std::optional const &maybe_name) { BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/-1, - /*b_seq_length_dim=*/-1, + /*a_seq_length_dim=*/std::nullopt, + /*b_seq_length_dim=*/std::nullopt, }; std::string name = @@ -141,30 +141,32 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::cast( parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t const &raw_input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation, - int groups, + nonnegative_int groups, bool use_bias, std::optional const &kernel_initializer, std::optional const &bias_initializer, std::optional const &kernel_regularizer, std::optional const &maybe_name) { - Conv2DAttrs attrs = Conv2DAttrs{outChannels, - kernelH, - kernelW, - strideH, - strideW, - paddingH, - paddingW, - groups, - activation, - use_bias}; + Conv2DAttrs attrs = Conv2DAttrs{ + /*out_channels=*/outChannels, + /*kernel_h=*/kernelH, + /*kernel_w=*/kernelW, + /*stride_h=*/strideH, + /*stride_w=*/strideW, + /*padding_h=*/paddingH, + /*padding_w=*/paddingW, + /*groups=*/groups, + /*activation=*/activation, + /*use_bias=*/use_bias, + }; std::string name = maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs})); @@ -192,7 +194,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation, bool use_bias, DataType data_type, @@ -200,11 +202,11 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( std::optional const &bias_initializer, std::optional const &maybe_name) { LinearAttrs attrs = LinearAttrs{ - outDim, - use_bias, - data_type, - activation, - std::nullopt, + /*out_channels=*/outDim, + /*use_bias=*/use_bias, + /*data_type=*/data_type, + /*activation=*/activation, + /*regularizer=*/std::nullopt, }; std::string name = @@ -239,18 +241,18 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t ParallelComputationGraphBuilder::embedding( parallel_tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype, std::optional const &kernel_initializer, std::optional const &maybe_name) { EmbeddingAttrs attrs = EmbeddingAttrs{ - num_entries, - outDim, - aggr, - dtype, + /*num_entries=*/num_entries, + 
+      /*out_channels=*/outDim,
+      /*aggr=*/aggr,
+      /*data_type=*/dtype,
   };
 
   std::string name =
@@ -274,10 +276,10 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention(
     parallel_tensor_guid_t const &query,
     parallel_tensor_guid_t const &key,
     parallel_tensor_guid_t const &value,
-    int embed_dim,
-    int num_heads,
-    std::optional<int> maybe_kdim,
-    std::optional<int> maybe_vdim,
+    nonnegative_int embed_dim,
+    nonnegative_int num_heads,
+    std::optional<nonnegative_int> maybe_kdim,
+    std::optional<nonnegative_int> maybe_vdim,
     float dropout,
     bool bias,
     bool add_bias_kv,
@@ -287,8 +289,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention(
     std::optional<InitializerAttrs> output_bias_initializer,
     std::optional<std::string> const &maybe_name) {
 
-  int kdim = maybe_kdim.value_or(embed_dim);
-  int vdim = maybe_vdim.value_or(embed_dim);
+  nonnegative_int kdim = maybe_kdim.value_or(embed_dim);
+  nonnegative_int vdim = maybe_vdim.value_or(embed_dim);
 
   MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{
       /*embed_dim=*/embed_dim,
@@ -491,10 +493,13 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::elu(
 
 parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition(
     parallel_tensor_guid_t const &input,
     ff_dim_t dim,
-    int degree,
+    nonnegative_int degree,
     std::optional<std::string> const &maybe_name) {
 
-  RepartitionAttrs attrs = RepartitionAttrs{dim, degree};
+  RepartitionAttrs attrs = RepartitionAttrs{
+      /*repartition_dim=*/dim,
+      /*repartition_degree=*/degree,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs}));
@@ -510,10 +515,13 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition(
 
 parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine(
     parallel_tensor_guid_t const &input,
     ff_dim_t dim,
-    int degree,
+    nonnegative_int degree,
     std::optional<std::string> const &maybe_name) {
 
-  CombineAttrs attrs = CombineAttrs{dim, degree};
+  CombineAttrs attrs = CombineAttrs{
+      /*combine_dim=*/dim,
+      /*combine_degree=*/degree,
+  };
 
   std::string name =
       maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs}));
@@ -528,7 +536,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine(
 
 parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate(
     parallel_tensor_guid_t const &input,
-    int degree,
+    nonnegative_int degree,
     std::optional<std::string> const &maybe_name) {
 
   ReplicateAttrs attrs = ReplicateAttrs{degree};
@@ -546,7 +554,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate(
 
 parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_reduce(
     parallel_tensor_guid_t const &input,
-    int degree,
+    nonnegative_int degree,
     std::optional<std::string> const &maybe_name) {
 
   ReductionAttrs attrs = ReductionAttrs{degree};
@@ -662,7 +670,7 @@ std::vector<parallel_tensor_guid_t> ParallelComputationGraphBuilder::add_layer(
 
   std::vector<DataflowOutput> raw_weight_tensors;
   for (auto const &kv : enumerate_vector(weights)) {
-    int weight_idx = kv.first;
+    nonnegative_int weight_idx = kv.first;
     ParallelTensorAttrs weight_tensor_attrs = kv.second;
 
     std::optional<std::string> weight_name =
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
index d30739486e..f37d08dc8a 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
@@ -16,7 +16,7 @@ parallel_layer_guid_t get_dst_layer(ParallelComputationGraphEdge const &e) {
   return parallel_layer_guid_t{e.raw_edge.dst.node};
 }
-int get_dst_layer_input_idx(ParallelComputationGraphEdge const &e) {
+nonnegative_int get_dst_layer_input_idx(ParallelComputationGraphEdge const &e) {
   return e.raw_edge.dst.idx;
 }
 
diff --git a/lib/pcg/src/pcg/start_invariant_machine_view.cc b/lib/pcg/src/pcg/start_invariant_machine_view.cc
index 1fcc3ea12f..e9f864d416 100644
--- a/lib/pcg/src/pcg/start_invariant_machine_view.cc
+++ b/lib/pcg/src/pcg/start_invariant_machine_view.cc
@@ -7,6 +7,7 @@
 #include "utils/containers/scanl.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/zip.h"
+#include "utils/nonnegative_int/num_elements.h"
 
 namespace FlexFlow {
 
 MachineView machine_view_from_start_invariant(
@@ -20,8 +21,8 @@ StartInvariantMachineView
   return StartInvariantMachineView{mv.dimensions, get_device_type(mv)};
 }
 
-size_t num_dims(StartInvariantMachineView const &start_inv_mv) {
-  return start_inv_mv.dimensions.size();
+nonnegative_int num_dims(StartInvariantMachineView const &start_inv_mv) {
+  return num_elements(start_inv_mv.dimensions);
 }
 
 DeviceType get_device_type(StartInvariantMachineView const &start_inv_mv) {
@@ -59,7 +60,7 @@ std::optional<MachineSpaceOffset> get_machine_space_offset(
     TaskSpaceCoordinate const &coord,
     MachineSpecification const &machine_specification) {
   MachineSpaceCoordinate dummy_start =
-      MachineSpaceCoordinate{0, 0, get_device_type(start_inv_machine_view)};
+      MachineSpaceCoordinate{0_n, 0_n, get_device_type(start_inv_machine_view)};
   MachineView mv =
       machine_view_from_start_invariant(start_inv_machine_view, dummy_start);
   std::optional<MachineSpaceCoordinate> ms_coord =
diff --git a/lib/pcg/test/src/pcg/computation_graph.cc b/lib/pcg/test/src/pcg/computation_graph.cc
index e2ed51b2f1..d92d65ad7b 100644
--- a/lib/pcg/test/src/pcg/computation_graph.cc
+++ b/lib/pcg/test/src/pcg/computation_graph.cc
@@ -13,9 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
@@ -40,9 +40,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
@@ -66,16 +66,16 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
 
     tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES);
     b.dense(input,
-            /*outDim=*/14,
+            /*outDim=*/14_n,
             /*activation=*/Activation::RELU,
             /*use_bias=*/true,
             /*data_type=*/DataType::FLOAT,
@@ -103,9 +103,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
@@ -131,9 +131,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
@@ -161,16 +161,16 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            10,
-            12,
+        TensorDims{FFOrdered<nonnegative_int>{
+            10_n,
+            12_n,
         }},
         DataType::FLOAT,
     };
 
     tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES);
     b.dense(input,
-            /*outDim=*/14,
+            /*outDim=*/14_n,
             /*activation=*/Activation::RELU,
             /*use_bias=*/true,
             /*data_type=*/DataType::FLOAT,
diff --git a/lib/pcg/test/src/pcg/computation_graph_builder.cc b/lib/pcg/test/src/pcg/computation_graph_builder.cc
index e7fa853be9..98a4e2a241 100644
--- a/lib/pcg/test/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/test/src/pcg/computation_graph_builder.cc
@@ -8,22 +8,22 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("ComputationGraphBuilder") {
     ComputationGraphBuilder b;
 
-    size_t batch_size = 2;
+    nonnegative_int batch_size = 2_n;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{batch_size, 3, 10, 10}},
+        TensorDims{FFOrdered<nonnegative_int>{batch_size, 3_n, 10_n, 10_n}},
         DataType::FLOAT,
     };
     tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES);
 
     tensor_guid_t output = b.conv2d(input,
-                                    /*outChannels=*/5,
-                                    /*kernelH=*/3,
-                                    /*kernelW=*/3,
-                                    /*strideH=*/1,
-                                    /*strideW=*/1,
-                                    /*paddingH=*/0,
-                                    /*paddingW=*/0);
+                                    /*outChannels=*/5_n,
+                                    /*kernelH=*/3_n,
+                                    /*kernelW=*/3_n,
+                                    /*strideH=*/1_n,
+                                    /*strideW=*/1_n,
+                                    /*paddingH=*/0_n,
+                                    /*paddingW=*/0_n);
     // ComputationGraph cg = b.computation_graph;
     // CHECK(get_layers(cg).size() == 1);
   }
diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
index 9068e14517..4102efd48e 100644
--- a/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
+++ b/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc
@@ -9,11 +9,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         V1BinarySeriesSplit{
             V1BinarySPDecomposition{
                 V1BinaryParallelSplit{
-                    V1BinarySPDecomposition{2},
-                    V1BinarySPDecomposition{2},
+                    V1BinarySPDecomposition{2_n},
+                    V1BinarySPDecomposition{2_n},
                 },
             },
-            V1BinarySPDecomposition{3},
+            V1BinarySPDecomposition{3_n},
         },
     };
 
@@ -68,11 +68,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     V1BinarySeriesSplit example_split = V1BinarySeriesSplit{
         V1BinarySPDecomposition{
             V1BinaryParallelSplit{
-                V1BinarySPDecomposition{2},
-                V1BinarySPDecomposition{2},
+                V1BinarySPDecomposition{2_n},
+                V1BinarySPDecomposition{2_n},
             },
         },
-        V1BinarySPDecomposition{3},
+        V1BinarySPDecomposition{3_n},
     };
 
     nlohmann::json example_json = {
@@ -124,11 +124,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     V1BinaryParallelSplit example_split = V1BinaryParallelSplit{
         V1BinarySPDecomposition{
             V1BinaryParallelSplit{
-                V1BinarySPDecomposition{2},
-                V1BinarySPDecomposition{2},
+                V1BinarySPDecomposition{2_n},
+                V1BinarySPDecomposition{2_n},
             },
         },
-        V1BinarySPDecomposition{3},
+        V1BinarySPDecomposition{3_n},
     };
 
     nlohmann::json example_json = {
diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc
index 8336d81bb4..59c606adb1 100644
--- a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc
+++ b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc
@@ -10,15 +10,15 @@ TEST_SUITE(FF_TEST_SUITE) {
     ComputationGraphBuilder b;
 
     TensorShape input_shape = TensorShape{
-        TensorDims{FFOrdered<size_t>{
-            12,
-            16,
+        TensorDims{FFOrdered<nonnegative_int>{
+            12_n,
+            16_n,
         }},
         DataType::FLOAT,
     };
 
     tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES);
-    tensor_guid_t mm_output = b.dense(input, 8);
+    tensor_guid_t mm_output = b.dense(input, 8_n);
     tensor_guid_t relu_output = b.relu(mm_output);
 
     return b.computation_graph;
diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc
index 8ce25c4bc5..682cf2d798 100644
--- a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc
+++ b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc
@@ -12,19 +12,19 @@
TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t mm_output = b.dense(input, 8); + parallel_tensor_guid_t mm_output = b.dense(input, 8_n); parallel_tensor_guid_t relu_output = b.relu(mm_output); return b.pcg; diff --git a/lib/pcg/test/src/pcg/machine_specification.cc b/lib/pcg/test/src/pcg/machine_specification.cc index c183ae0d31..6d339350a0 100644 --- a/lib/pcg/test/src/pcg/machine_specification.cc +++ b/lib/pcg/test/src/pcg/machine_specification.cc @@ -7,11 +7,10 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("MachineSpecification") { - MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/4, - /*num_cpus_per_node=*/16, - /*num_gpus_per_node=*/8, + /*num_nodes=*/4_n, + /*num_cpus_per_node=*/16_n, + /*num_gpus_per_node=*/8_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; @@ -32,19 +31,19 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_device_id") { SUBCASE("valid MachineSpaceCoordinate") { MachineSpaceCoordinate coord = MachineSpaceCoordinate{ - /*node_idx=*/2, - /*device_idx=*/12, + /*node_idx=*/2_n, + /*device_idx=*/12_n, DeviceType::CPU, }; device_id_t correct = - device_id_from_index(2 * 16 + 12, DeviceType::CPU); + device_id_from_index(nonnegative_int{2 * 16 + 12}, DeviceType::CPU); device_id_t result = get_device_id(ms, coord); CHECK(correct == result); } SUBCASE("MachineSpaceCoordinate out of bounds for given machine spec") { MachineSpaceCoordinate coord = MachineSpaceCoordinate{ - /*node_idx=*/2, - /*device_idx=*/18, + /*node_idx=*/2_n, + /*device_idx=*/18_n, DeviceType::CPU, }; CHECK_THROWS(get_device_id(ms, coord)); diff --git a/lib/pcg/test/src/pcg/machine_view.cc b/lib/pcg/test/src/pcg/machine_view.cc index 3e9d48fac3..e286f08bf2 100644 --- a/lib/pcg/test/src/pcg/machine_view.cc +++ b/lib/pcg/test/src/pcg/machine_view.cc @@ -12,10 +12,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("MachineView - utility functions") { MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}}}; SUBCASE("num_dims") { @@ -43,48 +43,48 @@ TEST_SUITE(FF_TEST_SUITE) { * Where the (x,) are the `TaskSpaceCoordinate`s, and the underlying grid * is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/3, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/3_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (2,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("TaskSpaceCoordinate is out of bounds") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{4}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{4_n}}; std::optional result = get_machine_space_coordinate(task, mv, coord, ms); std::optional correct = std::nullopt; @@ -112,52 +112,52 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, + MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/2, /*device_idx=*/2, DeviceType::GPU}; + /*node_idx=*/2_n, /*device_idx=*/2_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/2, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/2_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -179,52 +179,52 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/0, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/0_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/1, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/1_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -253,45 +253,45 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/8, - /*num_gpus_per_node=*/8, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/8_n, + /*num_gpus_per_node=*/8_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/3, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/3_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/7, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/7_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -319,23 +319,23 @@ TEST_SUITE(FF_TEST_SUITE) { * select */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set correct = { - device_id_t{gpu_id_t{1}}, - device_id_t{gpu_id_t{3}}, - device_id_t{gpu_id_t{5}}, + device_id_t{gpu_id_t{1_n}}, + device_id_t{gpu_id_t{3_n}}, + device_id_t{gpu_id_t{5_n}}, }; std::unordered_set result = get_device_ids(task, mv, ms); CHECK(result == correct); @@ -364,26 +364,26 @@ TEST_SUITE(FF_TEST_SUITE) { */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, + 
MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set correct = { - device_id_t{gpu_id_t{7}}, - device_id_t{gpu_id_t{9}}, - device_id_t{gpu_id_t{12}}, - device_id_t{gpu_id_t{14}}, + device_id_t{gpu_id_t{7_n}}, + device_id_t{gpu_id_t{9_n}}, + device_id_t{gpu_id_t{12_n}}, + device_id_t{gpu_id_t{14_n}}, }; std::unordered_set result = get_device_ids(task, mv, ms); CHECK(result == correct); diff --git a/lib/pcg/test/src/pcg/operator_task_space.cc b/lib/pcg/test/src/pcg/operator_task_space.cc index 13198d9456..fa06af3635 100644 --- a/lib/pcg/test/src/pcg/operator_task_space.cc +++ b/lib/pcg/test/src/pcg/operator_task_space.cc @@ -18,13 +18,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; std::unordered_set correct = {{ - TaskSpaceCoordinate{{0, 0}}, - TaskSpaceCoordinate{{0, 1}}, - TaskSpaceCoordinate{{1, 0}}, - TaskSpaceCoordinate{{1, 1}}, + TaskSpaceCoordinate{{0_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 1_n}}, + TaskSpaceCoordinate{{1_n, 0_n}}, + TaskSpaceCoordinate{{1_n, 1_n}}, }}; std::unordered_set result = get_task_space_coordinates(task); @@ -32,13 +32,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{1, 2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{1_n, 2_n, 2_n}}; std::unordered_set correct = {{ - TaskSpaceCoordinate{{0, 0, 0}}, - TaskSpaceCoordinate{{0, 0, 1}}, - TaskSpaceCoordinate{{0, 1, 0}}, - TaskSpaceCoordinate{{0, 1, 1}}, + TaskSpaceCoordinate{{0_n, 0_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 0_n, 1_n}}, + TaskSpaceCoordinate{{0_n, 1_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 1_n, 1_n}}, }}; std::unordered_set result = get_task_space_coordinates(task); @@ -48,17 +48,17 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_task_space_maximum_coordinate") { SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n}}; - TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2, 1}}; + TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); CHECK(correct == result); } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3, 2, 4}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n, 4_n}}; - TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2, 1, 3}}; + TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n, 3_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); CHECK(correct == result); } diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index dd8308561f..979a96d204 100644 --- 
a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -45,12 +45,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape, CreateGrad::YES); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -110,12 +110,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape tensor_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -186,12 +186,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -246,7 +246,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); LinearAttrs op_attrs = LinearAttrs{ - /*out_channels=*/14, + /*out_channels=*/14_n, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -293,7 +293,7 @@ TEST_SUITE(FF_TEST_SUITE) { {}, {raw_projection_tensor_attrs}); - ReplicateAttrs replicate_attrs = ReplicateAttrs{/*degree=*/2}; + ReplicateAttrs replicate_attrs = ReplicateAttrs{/*degree=*/2_n}; ParallelLayerAttrs replicate_layer_attrs = ParallelLayerAttrs{ PCGOperatorAttrs{replicate_attrs}, std::nullopt, @@ -346,12 +346,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape tensor_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{10, 1}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{10_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{2}, - DiscardCopyDegree{2}, + SumDegree{2_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index 3f66b33b6e..ef3173d744 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -26,18 +26,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::add") { ParallelComputationGraphBuilder b; - ShardParallelDim d1 = ShardParallelDim{10, 2}; - ShardParallelDim d2 = ShardParallelDim{15, 3}; + ShardParallelDim d1 = ShardParallelDim{10_n, 2_n}; + ShardParallelDim d2 = ShardParallelDim{15_n, 3_n}; ParallelTensorShape lhs_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{15, 3}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{15_n, 3_n}, }, ReplicaParallelDimSet{ - 
SumDegree{2}, - DiscardCopyDegree{1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -76,18 +76,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::batch_matmul") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{4, 2}; + ShardParallelDim batch_dim = ShardParallelDim{4_n, 2_n}; ParallelTensorShape a_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ batch_dim, - ShardParallelDim{10, 1}, - ShardParallelDim{15, 3}, + ShardParallelDim{10_n, 1_n}, + ShardParallelDim{15_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -97,12 +97,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{ batch_dim, - ShardParallelDim{15, 3}, - ShardParallelDim{12, 1}, + ShardParallelDim{15_n, 3_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -130,7 +130,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("op attrs") { PCGOperatorAttrs result = get_parallel_layer_attrs(b.pcg, layer).op_attrs; - PCGOperatorAttrs correct = PCGOperatorAttrs{BatchMatmulAttrs{-1, -1}}; + PCGOperatorAttrs correct = + PCGOperatorAttrs{BatchMatmulAttrs{std::nullopt, std::nullopt}}; CHECK(result == correct); } } @@ -141,12 +142,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{1}, + SumDegree{3_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -179,28 +180,28 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::conv2d") { ParallelComputationGraphBuilder b; - size_t batch_size = 2; + nonnegative_int batch_size = 2_n; TensorShape unpar_input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 3, 10, 10}}, + TensorDims{FFOrdered{batch_size, 3_n, 10_n, 10_n}}, DataType::FLOAT, }; - ParallelTensorShape input_shape = - lift_to_parallel_with_degrees(unpar_input_shape, - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 1, 1}); + ParallelTensorShape input_shape = lift_to_parallel_with_degrees( + unpar_input_shape, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 1_n, 1_n}); parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - int outChannels = 6; - int kernelH = 5; - int kernelW = 4; - int strideH = 3; - int strideW = 2; - int paddingH = 1; - int paddingW = 0; + nonnegative_int outChannels = 6_n; + nonnegative_int kernelH = 5_n; + nonnegative_int kernelW = 4_n; + nonnegative_int strideH = 3_n; + nonnegative_int strideW = 2_n; + nonnegative_int paddingH = 1_n; + nonnegative_int paddingW = 0_n; parallel_tensor_guid_t output = b.conv2d(input, /*outChannels=*/outChannels, /*kernelH=*/kernelH, @@ -254,7 +255,7 @@ TEST_SUITE(FF_TEST_SUITE) { strideW, paddingH, paddingW, - /*groups=*/1, + /*groups=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/true, }; @@ -301,18 +302,18 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; - 
int outDim = 14; + nonnegative_int outDim = 14_n; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.dense(input, @@ -341,8 +342,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::embedding") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{12, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{12_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -350,8 +351,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::INT32, @@ -359,8 +360,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.embedding(input, - /*num_entries=*/32, - /*outDim=*/8, + /*num_entries=*/32_n, + /*outDim=*/8_n, AggregateOp::SUM, DataType::FLOAT); parallel_layer_guid_t layer = get_source_layer(output); @@ -384,9 +385,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::multihead_attention") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{12, 2}; - ShardParallelDim sequence_dim = ShardParallelDim{16, 1}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{12_n, 2_n}; + ShardParallelDim sequence_dim = ShardParallelDim{16_n, 1_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape query_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -395,8 +396,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -405,8 +406,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape key_shape = query_shape; ParallelTensorShape value_shape = query_shape; - int embed_dim = 8; - int num_heads = 6; + nonnegative_int embed_dim = 8_n; + nonnegative_int num_heads = 6_n; parallel_tensor_guid_t query = b.create_input_tensor(query_shape); parallel_tensor_guid_t key = b.create_input_tensor(key_shape); @@ -435,8 +436,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::relu") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 3}; - ShardParallelDim feature_dim = ShardParallelDim{32, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 3_n}; + ShardParallelDim feature_dim = ShardParallelDim{32_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -445,8 +446,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -474,8 +475,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_partition") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -484,8 +485,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - 
DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -493,7 +494,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = - b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2); + b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -514,8 +515,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_combine") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -524,8 +525,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -533,7 +534,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = - b.parallel_combine(input, ff_dim_t{nonnegative_int{0}}, 2); + b.parallel_combine(input, ff_dim_t{nonnegative_int{0}}, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -554,8 +555,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_replicate") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -564,15 +565,15 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_replicate(input, 2); + parallel_tensor_guid_t output = b.parallel_replicate(input, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -593,8 +594,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_reduce") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -603,15 +604,15 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{4}, - DiscardCopyDegree{1}, + SumDegree{4_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_reduce(input, 2); + parallel_tensor_guid_t output = b.parallel_reduce(input, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { diff --git a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc index 8383754aa2..71c4d1b1d0 100644 --- a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc +++ 
b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc @@ -8,15 +8,15 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("StartInvariantMachineView - utility functions") { StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}}, DeviceType::GPU}; SUBCASE("num_dims") { - int result = num_dims(simv); - int correct = 2; + nonnegative_int result = num_dims(simv); + nonnegative_int correct = 2_n; CHECK(result == correct); } @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_strides") { std::vector result = get_strides(simv); - std::vector correct = {stride_t{2}, stride_t{2}}; + std::vector correct = {stride_t{2_n}, stride_t{2_n}}; CHECK(result == correct); } @@ -43,11 +43,11 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("StartInvariantMachineView - conversions") { MachineSpaceCoordinate start = - MachineSpaceCoordinate{1, 2, DeviceType::GPU}; + MachineSpaceCoordinate{1_n, 2_n, DeviceType::GPU}; std::vector dimensions = { - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{3}, + MachineViewDimension{stride_t{3_n}, MachineSpecificationDimension::INTRA_NODE}}; MachineView mv = MachineView{start, dimensions}; @@ -94,21 +94,21 @@ TEST_SUITE(FF_TEST_SUITE) { * | (0,) | | (1,) | | (2,) | | * +-------+-------+-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, - /*inter_node_bandwidth=*/0, - /*intra_node_bandwidth=*/0}; + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, + /*inter_node_bandwidth=*/0.0, + /*intra_node_bandwidth=*/0.0}; SUBCASE("get_machine_space_offset") { SUBCASE("Task with TaskSpaceCoordinate = (0,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -117,7 +117,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 2, DeviceType::GPU}; MachineSpaceOffset result = @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (2,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 4, DeviceType::GPU}; MachineSpaceOffset result = @@ -162,23 +162,23 @@ TEST_SUITE(FF_TEST_SUITE) { * +-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{1}, + {MachineViewDimension{stride_t{1_n}, 
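// [editorial note] On the expected offsets in these subcases: for a
// start-invariant view, the machine-space offset of a task coordinate c is
// componentwise c[i] * stride[i], routed onto the inter-node or intra-node
// axis named by each MachineViewDimension. 1-D case above: stride 2 on the
// intra-node (GPU) axis maps coords (0,), (1,), (2,) to GPU offsets 0, 2, 4.
// 2-D case below: strides (1, 2) map coord (1,1) to node offset 1, GPU
// offset 2.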
MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/4, - /*num_gpus_per_node=*/4, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/4_n, + /*num_gpus_per_node=*/4_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("get_machine_space_offset") { SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -187,7 +187,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 2, DeviceType::GPU}; MachineSpaceOffset result = @@ -196,7 +196,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{1, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -205,7 +205,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{1, 2, DeviceType::GPU}; MachineSpaceOffset result = diff --git a/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h new file mode 100644 index 0000000000..92f7bb1c03 --- /dev/null +++ b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_H + +#include "substitutions/pcg_pattern_match.dtg.h" +#include "substitutions/sub_parallel_computation_graph.dtg.h" +#include "substitutions/substitution.dtg.h" + +namespace FlexFlow { + +/** + * @brief Applies \p substitution to \p sub_pcg at the location specified by \p + * match, returning the resulting SubParallelComputationGraph + * + * @param sub_pcg + * @param substitution + * @param match The location at which to apply substitution. This location in + * sub_pcg should match substitution's PCGPattern. Likely created by running + * FlexFlow::find_pattern_matches(PCGPattern const &, + * SubParallelComputationGraph const &). 
+ * @return SubParallelComputationGraph A sub-PCG similar to sub_pcg, but with + * the subgraph specified by match replaced with the result of the output + * expression of substitution + */ +SubParallelComputationGraph + apply_substitution(SubParallelComputationGraph const &sub_pcg, + Substitution const &substitution, + PCGPatternMatch const &match); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h b/lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h similarity index 76% rename from lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h rename to lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h index a0461b075b..74089c5aab 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h +++ b/lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h @@ -1,10 +1,10 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_EVALUATE_SUBSTITUTION_OUTPUT_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_EVALUATE_SUBSTITUTION_OUTPUT_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_EVALUATE_SUBSTITUTION_OUTPUT_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_EVALUATE_SUBSTITUTION_OUTPUT_H +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.dtg.h" #include "substitutions/pcg_pattern_match.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/substitution.dtg.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.dtg.h" #include namespace FlexFlow { diff --git a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h similarity index 62% rename from lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h rename to lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h index 603cb670bf..cd7e782909 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h +++ b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h @@ -1,11 +1,11 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.dtg.h" #include "substitutions/output_graph/output_graph_expr.dtg.h" #include "substitutions/output_graph/output_graph_expr_node_output.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.dtg.h" namespace FlexFlow { diff 
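// [editorial sketch] How the API documented above composes with
// find_pattern_matches. Per the doc comment, find_pattern_matches is assumed
// to return the discovered PCGPatternMatch locations; the wrapper function
// name below is hypothetical, not part of this patch.
SubParallelComputationGraph apply_at_first_match(
    SubParallelComputationGraph const &sub_pcg, Substitution const &sub) {
  std::vector<PCGPatternMatch> matches =
      find_pattern_matches(sub.pcg_pattern, sub_pcg);
  // Re-run the search after every application: node ids in the rewritten
  // graph need not line up with any remaining matches.
  return matches.empty() ? sub_pcg
                         : apply_substitution(sub_pcg, sub, matches.front());
}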
--git a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.struct.toml b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.struct.toml similarity index 100% rename from lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.struct.toml rename to lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.struct.toml diff --git a/lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h b/lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h similarity index 85% rename from lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h rename to lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h index b7ce13db0e..c3f9eff349 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h +++ b/lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_PERFORM_SHAPE_INFERENCE_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_PERFORM_SHAPE_INFERENCE_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_PERFORM_SHAPE_INFERENCE_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_PERFORM_SHAPE_INFERENCE_H #include "op-attrs/parallel_tensor_shape.dtg.h" #include "pcg/parallel_computation_graph/parallel_layer_attrs.dtg.h" diff --git a/lib/substitutions/include/substitutions/constraint_type.enum.toml b/lib/substitutions/include/substitutions/constraint_type.enum.toml index 8646ba1c83..f366a17725 100644 --- a/lib/substitutions/include/substitutions/constraint_type.enum.toml +++ b/lib/substitutions/include/substitutions/constraint_type.enum.toml @@ -9,3 +9,6 @@ features = [ [[values]] name = "EQUAL" + +[[values]] +name = "DIVISIBLE_BY" diff --git a/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h b/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h new file mode 100644 index 0000000000..2b31dada04 --- /dev/null +++ b/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_GET_ATTRIBUTE_MAP_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_GET_ATTRIBUTE_MAP_H + +#include "op-attrs/pcg_operator_attrs.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_value.dtg.h" + +namespace FlexFlow { + +std::unordered_map + get_attribute_map(PCGOperatorAttrs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h index 4affdd697f..c2c11fac51 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h @@ -9,6 +9,8 @@ OperatorAttributeConstraint op_type_equals_constraint(OperatorType); OperatorAttributeConstraint op_attr_key_equals(OperatorAttributeKey, OperatorAttributeValue const &); 
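// [editorial sketch] Using the new DIVISIBLE_BY constraint (declared just
// below, backed by the enum value added above): e.g. a pattern that only
// matches linear operators whose output channels divide evenly across a
// parallel degree. That OperatorAttributePattern aggregates a brace-initialized
// set of constraints is an assumption about its layout.
OperatorAttributePattern partitionable_linear(nonnegative_int degree) {
  return OperatorAttributePattern{{
      op_type_equals_constraint(OperatorType::LINEAR),
      op_attr_key_divisible_by(OperatorAttributeKey::OUT_CHANNELS, degree),
  }};
}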
+OperatorAttributeConstraint + op_attr_key_divisible_by(OperatorAttributeKey, nonnegative_int denominator); OperatorAttributeConstraint make_equals_constraint(OperatorAttributeExpr const &, OperatorAttributeValue const &); diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml index eb758ea4fc..af3666d46f 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml @@ -56,6 +56,7 @@ values = [ { name = "SHOULD_BROADCAST_RHS" }, { name = "DIM" }, { name = "AFFINE" }, + { name = "ELEMENTWISE_AFFINE" }, { name = "MOMENTUM" }, { name = "REGULARIZER" }, { name = "SHAPE" }, diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h new file mode 100644 index 0000000000..d46403a847 --- /dev/null +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_OPERATOR_ATTRIBUTE_KEY_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_OPERATOR_ATTRIBUTE_KEY_H + +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" + +namespace FlexFlow { + +std::vector all_operator_attribute_keys(); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml index bceff393d2..4ed226907e 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml @@ -10,7 +10,8 @@ features = [ ] includes = [ - "substitutions/operator_pattern/operator_attribute_key.dtg.h" + "substitutions/operator_pattern/operator_attribute_key.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -19,4 +20,4 @@ type = "::FlexFlow::OperatorAttributeKey" [[fields]] name = "index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml index 8fe4a9494d..3312b292a0 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml @@ -7,7 +7,6 @@ features = [ "fmt", "json", ] -explicit_constructors = false includes = [ "", @@ -21,6 +20,7 @@ includes = [ "op-attrs/tensor_shape.dtg.h", "op-attrs/datatype.dtg.h", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -31,7 +31,7 @@ src_includes = [ ] [[values]] -type = "int" +type = "::FlexFlow::nonnegative_int" [[values]] type = "bool" @@ -40,7 +40,10 @@ type = "bool" type = "float" [[values]] -type = "std::vector" +type = "std::optional" + +[[values]] +type = "std::vector<::FlexFlow::nonnegative_int>" [[values]] type = "std::vector<::FlexFlow::ff_dim_t>" @@ -55,10 +58,7 @@ type = "std::optional<::FlexFlow::Activation>" type = 
"::FlexFlow::ff_dim_t" [[values]] -type = "size_t" - -[[values]] -type = "::FlexFlow::AggregateOp" +type = "std::optional<::FlexFlow::AggregateOp>" [[values]] type = "std::optional<::FlexFlow::RegularizerAttrs>" diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h b/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h index e550767292..8c047fc44d 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h @@ -2,14 +2,19 @@ #define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_H #include "substitutions/output_graph/output_graph_expr.dtg.h" +#include "substitutions/output_graph/output_graph_expr_input.dtg.h" #include "substitutions/output_graph/output_graph_expr_node.dtg.h" #include "substitutions/output_graph/output_graph_expr_node_output.dtg.h" namespace FlexFlow { +std::unordered_set get_nodes(OutputGraphExpr const &); + std::vector get_node_outputs(OutputGraphExpr const &, OutputGraphExprNode const &); +std::unordered_set get_inputs(OutputGraphExpr const &); + } // namespace FlexFlow #endif diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h new file mode 100644 index 0000000000..e172edb025 --- /dev/null +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_VALUE_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_VALUE_H + +#include "substitutions/output_graph/output_graph_expr_value.dtg.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" + +namespace FlexFlow { + +OpenDataflowValue raw_open_dataflow_value_from_output_graph_expr_value( + OutputGraphExprValue const &); +OutputGraphExprValue output_graph_expr_value_from_raw_open_dataflow_value( + OpenDataflowValue const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml new file mode 100644 index 0000000000..641250e1f0 --- /dev/null +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "OutputGraphExprValue" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "substitutions/output_graph/output_graph_expr_input.dtg.h", + "substitutions/output_graph/output_graph_expr_node_output.dtg.h", +] + +[[values]] +type = "::FlexFlow::OutputGraphExprNodeOutput" + +[[values]] +type = "::FlexFlow::OutputGraphExprInput" diff --git a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h index 60540c0711..0921569d62 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h +++ b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h @@ -20,6 +20,9 @@ std::pair set_attr_to_constant(OperatorAttributeKey key, OperatorAttributeValue const &value); +std::pair + set_op_type_attr(OperatorType); + } // namespace FlexFlow #endif diff --git 
a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml index d712ea96f7..483f27791a 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml +++ b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml @@ -18,11 +18,12 @@ includes = [ src_includes = [ "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", + "utils/fmt/optional.h", ] -# [[fields]] -# name = "clone_operator" -# type = "std::optional" +[[fields]] +name = "template_operator" +type = "std::optional<::FlexFlow::PatternNode>" # NOTE(@wmdi): Not sure if it aligns with other design. Or alternatively we can # define the assignment for each operator type. diff --git a/lib/substitutions/include/substitutions/pcg_pattern.h b/lib/substitutions/include/substitutions/pcg_pattern.h index 7342e8169f..f0962b15c2 100644 --- a/lib/substitutions/include/substitutions/pcg_pattern.h +++ b/lib/substitutions/include/substitutions/pcg_pattern.h @@ -10,6 +10,8 @@ namespace FlexFlow { +std::unordered_set get_nodes(PCGPattern const &); + /** * @brief Find all locations in \p pcg that match \p pattern */ diff --git a/lib/substitutions/include/substitutions/pcg_pattern_match.h b/lib/substitutions/include/substitutions/pcg_pattern_match.h index 388377d70c..b946173422 100644 --- a/lib/substitutions/include/substitutions/pcg_pattern_match.h +++ b/lib/substitutions/include/substitutions/pcg_pattern_match.h @@ -6,7 +6,7 @@ #include "substitutions/pcg_pattern_match.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/unlabelled/pattern_node_output.dtg.h" -#include "substitutions/unlabelled/unlabelled_dataflow_graph_pattern_match.dtg.h" +#include "substitutions/unlabelled/unlabelled_dataflow_graph_pattern_match.h" namespace FlexFlow { @@ -17,7 +17,7 @@ bidict SubParallelComputationGraph const &spcg); UnlabelledDataflowGraphPatternMatch - get_unlabelled_pattern_match(PCGPatternMatch const &); + get_unlabelled_pattern_match(PCGPatternMatch const &match); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h b/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h index 15cbb6127c..c0544abe1b 100644 --- a/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h +++ b/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h @@ -12,7 +12,7 @@ namespace FlexFlow { SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_dst(parallel_tensor_guid_t const &tensor, parallel_layer_guid_t const &layer, - int input_idx); + nonnegative_int input_idx); SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_use(open_parallel_tensor_guid_t const &tensor, parallel_tensor_use_t const &use); diff --git a/lib/substitutions/include/substitutions/substitution.h b/lib/substitutions/include/substitutions/substitution.h index 7b4e5e6912..7dc4e714ab 100644 --- a/lib/substitutions/include/substitutions/substitution.h +++ b/lib/substitutions/include/substitutions/substitution.h @@ -1,12 +1,14 @@ #ifndef _FLEXFLOW_SUBSTITUTIONS_SUBSTITUTION_H #define _FLEXFLOW_SUBSTITUTIONS_SUBSTITUTION_H -#include "substitutions/pcg_pattern_match.dtg.h" -#include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/substitution.dtg.h" namespace FlexFlow { +bool 
is_isomorphic_to(Substitution const &, Substitution const &); + +std::string as_dot(Substitution const &); + /** * @brief Checks that all internal invariants of the given substitution hold * @@ -22,25 +24,6 @@ namespace FlexFlow { */ bool is_valid_substitution(Substitution const &); -/** - * @brief Applies \p substitution to \p sub_pcg at the location specified by \p - * match, returning the resulting SubParallelComputationGraph - * - * @param sub_pcg - * @param substitution - * @param match The location at which to apply substitution. This location in - * sub_pcg should match substitution's PCGPattern. Likely created by running - * FlexFlow::find_pattern_matches(PCGPattern const &, - * SubParallelComputationGraph const &). - * @return SubParallelComputationGraph A sub-PCG similar to sub_pcg, but with - * the subgraph specified by match replaced with the result of the output - * expression of substitution - */ -SubParallelComputationGraph - apply_substitution(SubParallelComputationGraph const &sub_pcg, - Substitution const &substitution, - PCGPatternMatch const &match); - } // namespace FlexFlow #endif diff --git a/lib/substitutions/include/substitutions/substitution_builder.h b/lib/substitutions/include/substitutions/substitution_builder.h new file mode 100644 index 0000000000..1548b2269b --- /dev/null +++ b/lib/substitutions/include/substitutions/substitution_builder.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_BUILDER_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_BUILDER_H + +#include "substitutions/output_graph/output_graph_expr_value.dtg.h" +#include "substitutions/substitution.dtg.h" +#include "substitutions/unlabelled/pattern_value.dtg.h" +#include + +namespace FlexFlow { + +struct SubstitutionBuilder { +public: + SubstitutionBuilder(); + + std::pair + add_input(TensorAttributePattern const &, + std::optional const &name = std::nullopt); + void equate_outputs(PatternValue const &, OutputGraphExprValue const &); + + std::vector add_pattern_node( + OperatorAttributePattern const &node_pattern, + std::vector const &inputs, + std::vector const &output_patterns, + std::optional const &name = std::nullopt); + + std::vector + add_output_graph_node(OutputOperatorAttrsAssignment const &node_expr, + std::vector const &inputs, + nonnegative_int num_outputs); + + PatternNode pattern_node_named(std::string const &) const; + PatternInput pattern_input_named(std::string const &) const; + + Substitution get_substitution() const; + +private: + LabelledOpenDataflowGraph + pattern_g; + LabelledOpenDataflowGraph + output_g; + bidict input_mapping; + bidict pattern_node_names; + bidict pattern_input_names; + bidict output_mapping; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml index a57dd25845..71e11a12d5 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml @@ -10,7 +10,8 @@ features = [ ] includes = [ - "substitutions/tensor_pattern/tensor_attribute_key.dtg.h" + "substitutions/tensor_pattern/tensor_attribute_key.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -19,4 +20,4 @@ type = "::FlexFlow::TensorAttributeKey" [[fields]] name = "index" -type = "int" 
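// [editorial sketch] Driving the SubstitutionBuilder declared above. The
// builder's member template arguments appear stripped in this excerpt;
// plausible instantiations are LabelledOpenDataflowGraph<OperatorAttributePattern,
// TensorAttributePattern> for pattern_g and string-keyed bidicts for the name
// maps. The value wrapping below (PatternValue{...}, OutputGraphExprValue{...})
// is likewise assumed. This toy substitution rewrites a matched relu to an
// attribute-copied relu:
Substitution relu_identity_rewrite() {
  SubstitutionBuilder b;
  auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all());

  std::vector<PatternValue> p_outs = b.add_pattern_node(
      OperatorAttributePattern{{op_type_equals_constraint(OperatorType::RELU)}},
      {PatternValue{p_input}},
      {tensor_attribute_pattern_match_all()},
      /*name=*/"relu");

  std::vector<OutputGraphExprValue> o_outs = b.add_output_graph_node(
      OutputOperatorAttrsAssignment{b.pattern_node_named("relu"), {}},
      {OutputGraphExprValue{o_input}},
      /*num_outputs=*/1_n);

  b.equate_outputs(p_outs.at(0), o_outs.at(0));
  return b.get_substitution();
}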
+type = "::FlexFlow::nonnegative_int" diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h index 5b7ebf4ef8..c1e28f8d8f 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h @@ -2,10 +2,13 @@ #define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_TENSOR_PATTERN_TENSOR_ATTRIBUTE_PATTERN_H #include "substitutions/tensor_pattern/tensor_attribute_pattern.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { TensorAttributePattern tensor_attribute_pattern_match_all(); +TensorAttributePattern + tensor_attr_pattern_require_num_dims(nonnegative_int num_dims); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml index 46b703a7fc..d2b931fb2d 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml @@ -12,10 +12,11 @@ includes = [ "", "utils/hash/vector.h", "utils/fmt/vector.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[values]] -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[values]] -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" diff --git a/lib/substitutions/include/substitutions/unity_substitution_set.h b/lib/substitutions/include/substitutions/unity_substitution_set.h new file mode 100644 index 0000000000..183f76ac8a --- /dev/null +++ b/lib/substitutions/include/substitutions/unity_substitution_set.h @@ -0,0 +1,47 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_UNITY_SUBSTITUTION_SET_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_UNITY_SUBSTITUTION_SET_H + +#include "pcg/machine_specification.dtg.h" +#include "substitutions/substitution.dtg.h" +#include "utils/fmt/vector.h" + +namespace FlexFlow { + +std::vector + get_substitution_set(MachineSpecification const &resources); + +Substitution create_combine_inception(nonnegative_int num_convs, + nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_combine_concat(nonnegative_int num_inputs, + nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_replicate_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + bool use_bias); +Substitution create_partition_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + Activation activation, + bool use_bias); +Substitution create_partition_conv2d_combine(nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_partition_attention_combine(nonnegative_int num_heads, + nonnegative_int degree); +Substitution create_replicate_attention_reduce(nonnegative_int num_heads, + nonnegative_int degree); +Substitution create_partition_add_combine(ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_relu_combine(ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_concat_combine(nonnegative_int num_inputs, + ff_dim_t concat_dim, + ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_softmax_combine(ff_dim_t softmax_dim, + ff_dim_t partition_dim, + nonnegative_int degree); +Substitution 
create_fuse_linear_activation(Activation activation); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h b/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h index 7a7c9c3c28..8c58cb991c 100644 --- a/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h +++ b/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h @@ -9,7 +9,7 @@ namespace FlexFlow { PatternInput get_src_input(InputPatternEdge const &); PatternNode get_dst_node(InputPatternEdge const &); -int get_dst_idx(InputPatternEdge const &); +nonnegative_int get_dst_idx(InputPatternEdge const &); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h b/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h index 14c0b9ddcc..ce30b18f55 100644 --- a/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h +++ b/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h @@ -9,13 +9,10 @@ namespace FlexFlow { -// OpenDataflowGraphView apply_match(UnlabelledGraphPattern const &pattern, -// UnlabelledDataflowGraphPatternMatch const -// &match); - OpenDataflowSubgraphResult subgraph_matched(OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match); + bool pattern_matches_subgraph_under( UnlabelledGraphPattern const &pattern, OpenDataflowGraphView const &subgraph, @@ -30,11 +27,6 @@ bool unlabelled_pattern_does_match( UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion); -std::vector - find_pattern_matches(UnlabelledGraphPattern const &pattern, - OpenDataflowGraphView const &graph, - MatchAdditionalCriterion const &additional_criterion); - } // namespace FlexFlow #endif diff --git a/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h b/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h index 3dd5b262c9..67f513b8b1 100644 --- a/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h +++ b/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h @@ -6,7 +6,7 @@ namespace FlexFlow { PatternNode get_src_node(PatternNodeOutput const &); -int get_idx(PatternNodeOutput const &); +nonnegative_int get_idx(PatternNodeOutput const &); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h b/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h index 7316098fb5..817e829709 100644 --- a/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h +++ b/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h @@ -8,8 +8,8 @@ namespace FlexFlow { PatternNode get_src_node(StandardPatternEdge const &); PatternNode get_dst_node(StandardPatternEdge const &); -int get_src_idx(StandardPatternEdge const &); -int get_dst_idx(StandardPatternEdge const &); +nonnegative_int get_src_idx(StandardPatternEdge const &); +nonnegative_int get_dst_idx(StandardPatternEdge const &); } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc new file mode 100644 index 0000000000..61bfe15d7b --- /dev/null +++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc @@ -0,0 +1,165 @@ +#include "substitutions/apply_substitution/apply_substitution.h" 
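// [editorial sketch] Enumerating the substitution catalog declared in
// unity_substitution_set.h above for a small machine; the field order follows
// the MachineSpecification uses earlier in this patch, and the bandwidth
// values are placeholders.
std::vector<Substitution> subs =
    get_substitution_set(MachineSpecification{/*num_nodes=*/2_n,
                                              /*num_cpus_per_node=*/8_n,
                                              /*num_gpus_per_node=*/4_n,
                                              /*inter_node_bandwidth=*/1.0,
                                              /*intra_node_bandwidth=*/1.0});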
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/sub_parallel_computation_graph_data.dtg.h" +#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +SubParallelComputationGraph + apply_substitution(SubParallelComputationGraph const &spcg, + Substitution const &sub, + PCGPatternMatch const &match) { + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + SubParallelComputationGraphEdge new_edge = + 
+
+    std::unordered_set<SubParallelComputationGraphEdge>
+        outgoing_from_sub_edges;
+    for (ParallelComputationGraphEdge const &outgoing_edge :
+         get_subgraph_outgoing_edges(spcg, matched_nodes)) {
+      parallel_tensor_guid_t original_tensor =
+          get_parallel_tensor(outgoing_edge);
+      PatternNodeOutput pattern_tensor =
+          output_orig_pattern_mapping.at_r(original_tensor);
+      OutputGraphExprNodeOutput output_graph_tensor =
+          sub.outputs_mapping.at_l(pattern_tensor);
+      parallel_tensor_guid_t new_tensor =
+          output_post_outexpr_mapping.at_r(output_graph_tensor);
+
+      SubParallelComputationGraphEdge new_edge =
+          subpcg_edge_from_tensor_and_dst(
+              new_tensor,
+              get_dst_layer(outgoing_edge),
+              get_dst_layer_input_idx(outgoing_edge));
+      outgoing_from_sub_edges.insert(new_edge);
+    }
+
+    return set_union(std::vector{
+        post_edges_from_orig,
+        post_edges_from_sub,
+        incoming_to_sub_edges,
+        outgoing_from_sub_edges,
+    });
+  }();
+
+  std::unordered_set<input_parallel_tensor_guid_t> post_inputs =
+      pre_data.inputs;
+
+  std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+      post_value_data = [&] {
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_orig = filter_keys(
+                pre_data.value_data,
+                [&](open_parallel_tensor_guid_t const &t) {
+                  return visit_open_parallel_tensor_guid(
+                      t,
+                      overload{
+                          [&](parallel_tensor_guid_t const &t) {
+                            return contains(post_nodes_from_original_graph,
+                                            get_source_layer(t));
+                          },
+                          [](input_parallel_tensor_guid_t const &) {
+                            return true;
+                          },
+                      });
+                });
+
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_sub = output_graph_data.value_data;
+        return merge_disjoint_maps(post_value_data_from_orig,
+                                   post_value_data_from_sub);
+      }();
+
+  SubParallelComputationGraphData post_data = SubParallelComputationGraphData{
+      post_node_data,
+      post_edges,
+      post_inputs,
+      post_value_data,
+  };
+
+  return sub_pcg_from_graph_data(post_data);
+}
+
+} // namespace FlexFlow
diff --git a/lib/substitutions/src/substitutions/substitution_internal/evaluate_substitution_output.cc b/lib/substitutions/src/substitutions/apply_substitution/evaluate_substitution_output.cc
similarity index 96%
rename from lib/substitutions/src/substitutions/substitution_internal/evaluate_substitution_output.cc
rename to
lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc index 22e6a9f333..a5fc9a2e06 100644 --- a/lib/substitutions/src/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" #include "substitutions/output_graph/output_graph_expr.h" #include "substitutions/sub_parallel_computation_graph.h" #include "utils/bidict/algorithms/bidict_from_keys_and_values.h" diff --git a/lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc b/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc similarity index 95% rename from lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc rename to lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc index 9fa91d75b7..f49c7e0a3e 100644 --- a/lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/perform_shape_inference.h" +#include "substitutions/apply_substitution/perform_shape_inference.h" #include "op-attrs/get_output_shapes.h" #include "utils/containers/map_keys.h" #include "utils/containers/transform.h" diff --git a/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc b/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc index 53973dc1cb..6f41772a9e 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc @@ -1,5 +1,8 @@ #include "substitutions/operator_pattern/eval_list_access.h" #include "substitutions/operator_pattern/get_attribute.h" +#include "utils/containers/at_idx.h" +#include "utils/containers/make.h" +#include "utils/containers/transform.h" #include "utils/overload.h" namespace FlexFlow { @@ -18,20 +21,12 @@ std::optional [&](auto const &v) -> std::optional { using T = std::decay_t; - if constexpr (std::is_same_v>) { - if (acc.index >= v.size()) { - return std::nullopt; - } else { - int value = v.at(acc.index); - return OperatorAttributeValue{value}; - } + if constexpr (std::is_same_v>) { + return transform(at_idx(v, acc.index), + make()); } else if constexpr (std::is_same_v>) { - if (acc.index >= v.size()) { - return std::nullopt; - } else { - ff_dim_t value = v.at(acc.index); - return OperatorAttributeValue{value}; - } + return transform(at_idx(v, acc.index), + make()); } else { throw mk_runtime_error("Invalid operand"); } diff --git a/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc b/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc index a3ae9c84d1..fb0fd7f47b 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc @@ -1,5 +1,6 @@ #include "substitutions/operator_pattern/eval_list_size.h" #include "substitutions/operator_pattern/get_attribute.h" +#include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" namespace FlexFlow { @@ -18,9 +19,9 @@ std::optional [&](auto const &v) -> std::optional { using T 
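// [editorial note] On the eval_list_access rewrite below: at_idx(v, i) is
// understood to return std::optional<T> (empty when i is out of range), and
// make<OperatorAttributeValue>() a callable wrapping its argument in the
// variant, so transform(at_idx(...), make<...>()) collapses the old explicit
// bounds check and element copy into one optional-mapping expression. Both
// helper semantics are inferred from the new includes, not stated in the
// patch itself.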
= std::decay_t; - if constexpr (std::is_same_v> || + if constexpr (std::is_same_v> || std::is_same_v>) { - size_t size = v.size(); + nonnegative_int size = num_elements(v); return OperatorAttributeValue{size}; } else { throw mk_runtime_error("Invalid operand"); diff --git a/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc b/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc index 442d3345a1..cb733e16ff 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc @@ -8,7 +8,7 @@ std::optional get_attribute(BatchMatmulAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -18,13 +18,13 @@ std::optional get_attribute(BatchNormAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::EPSILON: - return p.eps; + return OperatorAttributeValue{p.eps}; case OperatorAttributeKey::AFFINE: - return p.affine; + return OperatorAttributeValue{p.affine}; case OperatorAttributeKey::MOMENTUM: - return p.momentum; + return OperatorAttributeValue{p.momentum}; default: return std::nullopt; } @@ -34,9 +34,9 @@ std::optional get_attribute(BroadcastAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::TARGET_DIMS: - return p.target_dims; + return OperatorAttributeValue{p.target_dims}; default: return std::nullopt; } @@ -46,9 +46,9 @@ std::optional get_attribute(CastAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::DATA_TYPE: - return p.dtype; + return OperatorAttributeValue{p.dtype}; default: return std::nullopt; } @@ -58,11 +58,11 @@ std::optional get_attribute(CombineAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DIM: - return p.combine_dim; + return OperatorAttributeValue{p.combine_dim}; case OperatorAttributeKey::PARALLEL_DIM: - return p.combine_degree; + return OperatorAttributeValue{p.combine_degree}; default: return std::nullopt; } @@ -72,9 +72,9 @@ std::optional get_attribute(ConcatAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.axis; + return OperatorAttributeValue{p.axis}; default: return std::nullopt; } @@ -84,25 +84,25 @@ std::optional get_attribute(Conv2DAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::KERNEL_H: - return p.kernel_h; + return OperatorAttributeValue{p.kernel_h}; case OperatorAttributeKey::KERNEL_W: - return p.kernel_w; + return OperatorAttributeValue{p.kernel_w}; case OperatorAttributeKey::STRIDE_H: - return p.stride_h; + return OperatorAttributeValue{p.stride_h}; case OperatorAttributeKey::STRIDE_W: - return p.stride_w; + 
return OperatorAttributeValue{p.stride_w}; case OperatorAttributeKey::PADDING_H: - return p.padding_h; + return OperatorAttributeValue{p.padding_h}; case OperatorAttributeKey::PADDING_W: - return p.padding_w; + return OperatorAttributeValue{p.padding_w}; case OperatorAttributeKey::GROUPS: - return p.groups; + return OperatorAttributeValue{p.groups}; case OperatorAttributeKey::ACTIVATION: - return p.activation; + return OperatorAttributeValue{p.activation}; case OperatorAttributeKey::USE_BIAS: - return p.use_bias; + return OperatorAttributeValue{p.use_bias}; default: return std::nullopt; } @@ -112,7 +112,7 @@ std::optional get_attribute(ElementBinaryAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -122,7 +122,7 @@ std::optional get_attribute(ElementUnaryAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -132,7 +132,7 @@ std::optional get_attribute(DropoutAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -142,15 +142,15 @@ std::optional get_attribute(EmbeddingAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::DATA_TYPE: - return p.data_type; + return OperatorAttributeValue{p.data_type}; case OperatorAttributeKey::AGGR: - return p.aggr; + return OperatorAttributeValue{p.aggr}; case OperatorAttributeKey::NUM_ENTRIES: - return p.num_entries; + return OperatorAttributeValue{p.num_entries}; case OperatorAttributeKey::OUT_CHANNELS: - return p.out_channels; + return OperatorAttributeValue{p.out_channels}; default: return std::nullopt; } @@ -160,7 +160,7 @@ std::optional get_attribute(FlatAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -170,9 +170,9 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.dim; + return OperatorAttributeValue{p.dim}; default: return std::nullopt; } @@ -182,7 +182,7 @@ std::optional get_attribute(InputAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -192,11 +192,11 @@ std::optional get_attribute(LayerNormAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AFFINE: - return p.elementwise_affine; + return OperatorAttributeValue{p.elementwise_affine}; case OperatorAttributeKey::AXES: - return vector_of(p.axes); + return OperatorAttributeValue{vector_of(p.axes)}; default: return std::nullopt; } @@ -206,17 +206,17 @@ std::optional get_attribute(LinearAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: 
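// [editorial note] The mechanical OperatorAttributeValue{...} wrapping
// throughout this file follows from dropping `explicit_constructors = false`
// in operator_attribute_value.variant.toml earlier in the patch: implicit
// conversion into the generated variant is no longer available, so each
// return site now constructs the sum type explicitly.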
- return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::OUT_CHANNELS: - return p.out_channels; + return OperatorAttributeValue{p.out_channels}; case OperatorAttributeKey::USE_BIAS: - return p.use_bias; + return OperatorAttributeValue{p.use_bias}; case OperatorAttributeKey::DATA_TYPE: - return p.data_type; + return OperatorAttributeValue{p.data_type}; case OperatorAttributeKey::ACTIVATION: - return p.activation; + return OperatorAttributeValue{p.activation}; case OperatorAttributeKey::REGULARIZER: - return p.regularizer; + return OperatorAttributeValue{p.regularizer}; default: return std::nullopt; } @@ -226,13 +226,13 @@ std::optional get_attribute(MultiHeadAttentionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::NUM_HEADS: - return p.num_heads; + return OperatorAttributeValue{p.num_heads}; case OperatorAttributeKey::USE_BIAS: - return p.bias; + return OperatorAttributeValue{p.bias}; case OperatorAttributeKey::DROPOUT: - return p.dropout; + return OperatorAttributeValue{p.dropout}; default: return std::nullopt; } @@ -242,7 +242,7 @@ std::optional get_attribute(NoopAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -252,23 +252,23 @@ std::optional get_attribute(Pool2DAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::KERNEL_H: - return p.kernel_h; + return OperatorAttributeValue{p.kernel_h}; case OperatorAttributeKey::KERNEL_W: - return p.kernel_w; + return OperatorAttributeValue{p.kernel_w}; case OperatorAttributeKey::STRIDE_H: - return p.stride_h; + return OperatorAttributeValue{p.stride_h}; case OperatorAttributeKey::STRIDE_W: - return p.stride_w; + return OperatorAttributeValue{p.stride_w}; case OperatorAttributeKey::PADDING_H: - return p.padding_h; + return OperatorAttributeValue{p.padding_h}; case OperatorAttributeKey::PADDING_W: - return p.padding_w; + return OperatorAttributeValue{p.padding_w}; case OperatorAttributeKey::POOL_TYPE: - return p.pool_type; + return OperatorAttributeValue{p.pool_type}; case OperatorAttributeKey::ACTIVATION: - return std::optional{p.activation}; + return OperatorAttributeValue{p.activation}; default: return std::nullopt; } @@ -278,7 +278,7 @@ std::optional get_attribute(ReduceAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -288,9 +288,9 @@ std::optional get_attribute(ReductionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.reduction_degree; + return OperatorAttributeValue{p.reduction_degree}; default: return std::nullopt; } @@ -300,11 +300,11 @@ std::optional get_attribute(RepartitionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DIM: - return p.repartition_dim; + return 
OperatorAttributeValue{p.repartition_dim}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.repartition_degree; + return OperatorAttributeValue{p.repartition_degree}; default: return std::nullopt; } @@ -314,9 +314,9 @@ std::optional get_attribute(ReplicateAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.replicate_degree; + return OperatorAttributeValue{p.replicate_degree}; default: return std::nullopt; } @@ -326,7 +326,7 @@ std::optional get_attribute(ReshapeAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -336,9 +336,9 @@ std::optional get_attribute(ReverseAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.axis; + return OperatorAttributeValue{p.axis}; default: return std::nullopt; } @@ -348,9 +348,9 @@ std::optional get_attribute(SplitAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.axis; + return OperatorAttributeValue{p.axis}; default: return std::nullopt; } @@ -360,9 +360,9 @@ std::optional get_attribute(SoftmaxAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.dim; + return OperatorAttributeValue{p.dim}; default: return std::nullopt; } @@ -372,7 +372,7 @@ std::optional get_attribute(TopKAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -382,9 +382,9 @@ std::optional get_attribute(TransposeAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PERMUTATION: - return vector_of(p.perm); + return OperatorAttributeValue{vector_of(p.perm)}; default: return std::nullopt; } @@ -394,7 +394,7 @@ std::optional get_attribute(WeightAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } diff --git a/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc b/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc new file mode 100644 index 0000000000..f1b7440aed --- /dev/null +++ b/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc @@ -0,0 +1,25 @@ +#include "substitutions/operator_pattern/get_attribute_map.h" +#include "substitutions/operator_pattern/get_attribute.h" +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_key.h" +#include "substitutions/operator_pattern/operator_attribute_value.dtg.h" + +namespace FlexFlow { + +std::unordered_map + get_attribute_map(PCGOperatorAttrs const &op_attrs) { + 
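+  // Collects every attribute this operator actually defines: probes
+  // get_attribute with each known key and keeps only the keys for which a
+  // value comes back, so absent attributes are omitted from the map rather
+  // than mapped to a sentinel. E.g. for a LinearAttrs op the result is
+  // expected to contain entries such as {OP_TYPE -> LINEAR,
+  // OUT_CHANNELS -> ..., USE_BIAS -> ...}.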
std::unordered_map result; + + for (OperatorAttributeKey const &attr_key : all_operator_attribute_keys()) { + std::optional attr_value = + get_attribute(op_attrs, attr_key); + + if (attr_value.has_value()) { + result.insert({attr_key, attr_value.value()}); + } + } + + return result; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc index 5ab528ed3d..29aef07e3a 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc @@ -20,6 +20,16 @@ OperatorAttributeConstraint }; } +OperatorAttributeConstraint + op_attr_key_divisible_by(OperatorAttributeKey key, + nonnegative_int denominator) { + return OperatorAttributeConstraint{ + ConstraintType::DIVISIBLE_BY, + OperatorAttributeExpr{key}, + OperatorAttributeValue{denominator}, + }; +} + OperatorAttributeConstraint make_equals_constraint(OperatorAttributeExpr const &expr, OperatorAttributeValue const &val) { diff --git a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc new file mode 100644 index 0000000000..232d2c2f12 --- /dev/null +++ b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc @@ -0,0 +1,68 @@ +#include "substitutions/operator_pattern/operator_attribute_key.h" + +namespace FlexFlow { + +// This should probably be integrated into proj, +// tracked in https://github.com/flexflow/FlexFlow/issues/1478 +std::vector all_operator_attribute_keys() { + return { + OperatorAttributeKey::OP_TYPE, + OperatorAttributeKey::USE_BIAS, + OperatorAttributeKey::GROUPS, + OperatorAttributeKey::POOL_TYPE, + OperatorAttributeKey::KERNEL_H, + OperatorAttributeKey::KERNEL_W, + OperatorAttributeKey::DATA_TYPE, + OperatorAttributeKey::SCALAR, + OperatorAttributeKey::STRIDE_H, + OperatorAttributeKey::STRIDE_W, + OperatorAttributeKey::PADDING_H, + OperatorAttributeKey::PADDING_W, + OperatorAttributeKey::AGGR, + OperatorAttributeKey::NUM_ENTRIES, + OperatorAttributeKey::OUT_CHANNELS, + OperatorAttributeKey::ACTIVATION, + OperatorAttributeKey::NUMDIM, + OperatorAttributeKey::AXIS, + OperatorAttributeKey::PERMUTATION, + OperatorAttributeKey::OUTSHUFFLE, + OperatorAttributeKey::MERGE_GCONV_COUNT, + OperatorAttributeKey::AXES, + OperatorAttributeKey::KEEP_DIMS, + OperatorAttributeKey::EPSILON, + OperatorAttributeKey::PARALLEL_OP_DIM, + OperatorAttributeKey::PARALLEL_OP_DEGREE, + OperatorAttributeKey::SOFTMAX_DIM, + OperatorAttributeKey::NUM_HEADS, + OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeKey::PAD, + OperatorAttributeKey::EMBED_DIM, + OperatorAttributeKey::KDIM, + OperatorAttributeKey::VDIM, + OperatorAttributeKey::DROPOUT, + OperatorAttributeKey::BIAS, + OperatorAttributeKey::ADD_BIAS_KV, + OperatorAttributeKey::ADD_ZERO_ATTN, + OperatorAttributeKey::A_SEQ_LENGTH_DIM, + OperatorAttributeKey::B_SEQ_LENGTH_DIM, + OperatorAttributeKey::RELU, + OperatorAttributeKey::TARGET_DIMS, + OperatorAttributeKey::RATE, + OperatorAttributeKey::SEED, + OperatorAttributeKey::SHOULD_BROADCAST_LHS, + OperatorAttributeKey::SHOULD_BROADCAST_RHS, + OperatorAttributeKey::DIM, + OperatorAttributeKey::ELEMENTWISE_AFFINE, + OperatorAttributeKey::REGULARIZER, + OperatorAttributeKey::SHAPE, + 
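+      // NOTE: this list is maintained by hand (see the issue linked above)
+      // and must stay in sync with the OperatorAttributeKey enum; any key
+      // omitted here is silently skipped by get_attribute_map.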
OperatorAttributeKey::SPLITS, + OperatorAttributeKey::K, + OperatorAttributeKey::SORTED, + OperatorAttributeKey::COMBINE_DIM, + OperatorAttributeKey::COMBINE_DEGREE, + OperatorAttributeKey::NUM_INPUTS, + }; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc index 7d65f687c8..4f11b343f8 100644 --- a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc +++ b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc @@ -33,10 +33,12 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( switch (op_type) { case OperatorType::MULTIHEAD_ATTENTION: return PCGOperatorAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/acc.get(OperatorAttributeKey::EMBED_DIM), - /*num_heads=*/acc.get(OperatorAttributeKey::NUM_HEADS), - /*kdim=*/acc.get(OperatorAttributeKey::KDIM), - /*vdim=*/acc.get(OperatorAttributeKey::VDIM), + /*embed_dim=*/acc.get( + OperatorAttributeKey::EMBED_DIM), + /*num_heads=*/ + acc.get(OperatorAttributeKey::NUM_HEADS), + /*kdim=*/acc.get(OperatorAttributeKey::KDIM), + /*vdim=*/acc.get(OperatorAttributeKey::VDIM), /*dropout=*/acc.get(OperatorAttributeKey::DROPOUT), /*bias=*/acc.get(OperatorAttributeKey::BIAS), /*add_bias_kv=*/acc.get(OperatorAttributeKey::ADD_BIAS_KV), @@ -44,12 +46,14 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( }}; case OperatorType::POOL2D: return PCGOperatorAttrs{Pool2DAttrs{ - /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), - /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), - /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), - /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), - /*padding_h=*/acc.get(OperatorAttributeKey::PADDING_H), - /*padding_w=*/acc.get(OperatorAttributeKey::PADDING_W), + /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), + /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), + /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), + /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), + /*padding_h=*/ + acc.get(OperatorAttributeKey::PADDING_H), + /*padding_w=*/ + acc.get(OperatorAttributeKey::PADDING_W), /*pool_type=*/acc.get(OperatorAttributeKey::POOL_TYPE), /*activation=*/ acc.get>(OperatorAttributeKey::ACTIVATION) @@ -62,7 +66,8 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( case OperatorType::DROPOUT: case OperatorType::LINEAR: return PCGOperatorAttrs{LinearAttrs{ - /*out_channels=*/acc.get(OperatorAttributeKey::OUT_CHANNELS), + /*out_channels=*/acc.get( + OperatorAttributeKey::OUT_CHANNELS), /*use_bias=*/acc.get(OperatorAttributeKey::USE_BIAS), /*data_type=*/acc.get(OperatorAttributeKey::DATA_TYPE), /*activation=*/ diff --git a/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc b/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc index 3d6aadc795..f6d1410a07 100644 --- a/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc +++ b/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc @@ -1,9 +1,18 @@ #include "substitutions/output_graph/output_graph_expr.h" #include "utils/containers/transform.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { +std::unordered_set get_nodes(OutputGraphExpr const &g) { + std::unordered_set raw_nodes = 
get_nodes(g.raw_graph); + + return transform(raw_nodes, + [](Node const &n) { return OutputGraphExprNode{n}; }); +} + std::vector get_node_outputs(OutputGraphExpr const &g, OutputGraphExprNode const &n) { std::vector raw_outputs = @@ -14,4 +23,13 @@ std::vector }); } +std::unordered_set get_inputs(OutputGraphExpr const &g) { + std::unordered_set raw_inputs = + get_open_dataflow_graph_inputs(g.raw_graph); + + return transform(raw_inputs, [](DataflowGraphInput const &i) { + return OutputGraphExprInput{i}; + }); +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc b/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc new file mode 100644 index 0000000000..b35f3bbeae --- /dev/null +++ b/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc @@ -0,0 +1,30 @@ +#include "substitutions/output_graph/output_graph_expr_value.h" +#include "utils/overload.h" + +namespace FlexFlow { + +OpenDataflowValue raw_open_dataflow_value_from_output_graph_expr_value( + OutputGraphExprValue const &v) { + return v.visit(overload{ + [](OutputGraphExprNodeOutput const &o) { + return OpenDataflowValue{o.raw_dataflow_output}; + }, + [](OutputGraphExprInput const &i) { + return OpenDataflowValue{i.raw_dataflow_graph_input}; + }, + }); +} + +OutputGraphExprValue output_graph_expr_value_from_raw_open_dataflow_value( + OpenDataflowValue const &v) { + return v.visit(overload{ + [](DataflowOutput const &o) { + return OutputGraphExprValue{OutputGraphExprNodeOutput{o}}; + }, + [](DataflowGraphInput const &i) { + return OutputGraphExprValue{OutputGraphExprInput{i}}; + }, + }); +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc b/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc index fa247cd151..f6b90ef054 100644 --- a/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc +++ b/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc @@ -1,7 +1,9 @@ #include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/operator_pattern/get_attribute_map.h" #include "substitutions/output_graph/materialize_operator_from_attrs_map.h" #include "substitutions/output_graph/output_operator_attribute_expr.h" #include "utils/containers/map_values.h" +#include "utils/containers/merge_maps.h" namespace FlexFlow { @@ -12,14 +14,31 @@ OutputOperatorAttrsAssignment output_operator_clone_node(PatternNode const &) { PCGOperatorAttrs materialize_output_operator_from_attrs_assignment( OutputOperatorAttrsAssignment const &attrs_assignment, std::unordered_map const &node_match) { - std::unordered_map attr_map = - map_values(attrs_assignment.assignments, - [&](OutputOperatorAttributeExpr const &expr) { - return evaluate_output_operator_attribute_expr(expr, - node_match); - }); - - return materialize_operator_from_attrs_map(attr_map); + + std::unordered_map + template_attrs_map = [&]() + -> std::unordered_map { + if (attrs_assignment.template_operator.has_value()) { + PatternNode template_node = attrs_assignment.template_operator.value(); + PCGOperatorAttrs template_op_attrs = node_match.at(template_node); + return get_attribute_map(template_op_attrs); + } else { + return {}; + } + }(); + + std::unordered_map + assignments_attrs_map = map_values( + attrs_assignment.assignments, + [&](OutputOperatorAttributeExpr const &expr) { + return 
evaluate_output_operator_attribute_expr(expr, node_match); + }); + + std::unordered_map + joined_attrs_map = + merge_map_right_dominates(template_attrs_map, assignments_attrs_map); + + return materialize_operator_from_attrs_map(joined_attrs_map); } std::pair @@ -39,4 +58,10 @@ std::pair }; } +std::pair + set_op_type_attr(OperatorType op_type) { + return set_attr_to_constant(OperatorAttributeKey::OP_TYPE, + OperatorAttributeValue{op_type}); +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index e53877006d..a0af875848 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -3,13 +3,23 @@ #include "substitutions/pcg_pattern_match.h" #include "substitutions/sub_parallel_computation_graph.h" #include "substitutions/tensor_pattern/satisfies_pattern.h" +#include "substitutions/unlabelled/find_pattern_matches.h" #include "substitutions/unlabelled/pattern_value.h" #include "utils/containers/map_values.h" #include "utils/containers/transform.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { +std::unordered_set get_nodes(PCGPattern const &p) { + std::unordered_set raw_nodes = get_nodes(p.raw_graph); + + return transform(raw_nodes, [](Node const &n) { return PatternNode{n}; }); +} + static MatchAdditionalCriterion pcg_pattern_criteria(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { @@ -63,6 +73,14 @@ OperatorAttributePattern get_operator_pattern(PCGPattern const &p, return p.raw_graph.at(n.raw_node); } +std::unordered_set get_inputs(PCGPattern const &p) { + std::unordered_set raw_inputs = + get_open_dataflow_graph_inputs(p.raw_graph); + + return transform(raw_inputs, + [](DataflowGraphInput const &i) { return PatternInput{i}; }); +} + std::vector get_pattern_node_outputs(PCGPattern const &pattern, PatternNode const &node) { diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc index 0c673f0a8a..83df74f21b 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc @@ -188,33 +188,34 @@ bool sub_pcgs_are_isomorphic(SubParallelComputationGraph const &lhs, } std::string as_dot(SubParallelComputationGraph const &spcg) { - std::function get_node_label = - [](ParallelLayerAttrs const &a) -> std::string { - RecordFormatter r = as_dot(a.op_attrs); - - if (a.name.has_value()) { - RecordFormatter rr; - rr << "Name" << a.name.value(); - r << rr; - } - - std::ostringstream oss; - oss << r; - return oss.str(); - }; - - std::function get_input_label = - [](ParallelTensorAttrs const &a) -> std::string { - RecordFormatter r; - - r << fmt::to_string(a.shape); - - std::ostringstream oss; - oss << r; - return oss.str(); - }; - - return as_dot(spcg.raw_graph, get_node_label, get_input_label); + NOT_IMPLEMENTED(); + // std::function get_node_label = + // [](ParallelLayerAttrs const &a) -> std::string { + // RecordFormatter r = as_dot(a.op_attrs); + // + // if (a.name.has_value()) { + // RecordFormatter rr; + // rr << "Name" << a.name.value(); + // r << rr; + // } + // + // std::ostringstream oss; + // oss << r; + // return 
oss.str(); + // }; + // + // std::function get_input_label = + // [](ParallelTensorAttrs const &a) -> std::string { + // RecordFormatter r; + // + // r << fmt::to_string(a.shape); + // + // std::ostringstream oss; + // oss << r; + // return oss.str(); + // }; + // + // return as_dot(spcg.raw_graph, get_node_label, get_input_label); } void debug_print_dot(SubParallelComputationGraph const &spcg) { diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc index bb8cb449bc..0d2b912049 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc @@ -6,7 +6,7 @@ namespace FlexFlow { SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_dst(parallel_tensor_guid_t const &tensor, parallel_layer_guid_t const &layer, - int input_idx) { + nonnegative_int input_idx) { return SubParallelComputationGraphEdge{ OpenDataflowEdge{ DataflowEdge{ diff --git a/lib/substitutions/src/substitutions/substitution.cc b/lib/substitutions/src/substitutions/substitution.cc index 22e15cb01a..874700d303 100644 --- a/lib/substitutions/src/substitutions/substitution.cc +++ b/lib/substitutions/src/substitutions/substitution.cc @@ -1,169 +1,164 @@ #include "substitutions/substitution.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" -#include "substitutions/open_parallel_tensor_guid_t.h" -#include "substitutions/output_graph/output_operator_attrs_assignment.h" -#include "substitutions/pcg_pattern_match.h" -#include "substitutions/sub_parallel_computation_graph.h" -#include "substitutions/sub_parallel_computation_graph_edge.h" -#include "substitutions/substitution_internal/evaluate_substitution_output.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h" -#include "utils/containers/merge_maps.h" -#include "utils/containers/restrict_keys.h" -#include "utils/containers/set_minus.h" -#include "utils/containers/values.h" -#include "utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.h" -#include "utils/graph/node/algorithms.h" -#include "utils/overload.h" +#include "substitutions/output_graph/output_graph_expr.h" +#include "substitutions/pcg_pattern.h" +#include "utils/bidict/algorithms/left_entries.h" +#include "utils/bidict/algorithms/right_entries.h" +#include "utils/containers/map_values.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/find_isomorphism.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/rewrite_node_labels.h" +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.dtg.h" +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h" namespace FlexFlow { -bool is_valid_substitution(Substitution const &) { - NOT_IMPLEMENTED(); -} +bool is_isomorphic_to(Substitution const &l, Substitution const &r) { + OpenDataflowGraphIsomorphism pcg_pattern_isomorphism = ({ + std::optional maybe_isomorphism = + find_isomorphism(l.pcg_pattern.raw_graph, r.pcg_pattern.raw_graph); -SubParallelComputationGraph - apply_substitution(SubParallelComputationGraph const &spcg, - Substitution const &sub, - PCGPatternMatch const &match) { - auto substitution_output_result = - evaluate_substitution_output(spcg, sub, match); - SubParallelComputationGraph substitution_output_graph = - 
substitution_output_result.first; - OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = - substitution_output_result.second; - - SubParallelComputationGraphData output_graph_data = - get_sub_pcg_data(substitution_output_graph); - SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); - - std::unordered_set pre_nodes = - keys(pre_data.node_data); - std::unordered_set matched_nodes = - unordered_set_of(values(match.node_assignment)); - std::unordered_set post_nodes_from_original_graph = - set_minus(pre_nodes, matched_nodes); - - std::unordered_map post_node_data = - [&] { - std::unordered_map - post_node_data_from_orig = restrict_keys( - pre_data.node_data, post_nodes_from_original_graph); - std::unordered_map - post_node_data_from_sub = output_graph_data.node_data; - - return merge_maps(post_node_data_from_orig, post_node_data_from_sub); - }(); - - std::unordered_set post_edges = [&] { - std::unordered_set post_edges_from_orig = - filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { - if (e.raw_edge.has()) { - return true; - } else { - DataflowEdge dfe = e.raw_edge.get(); - parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; - parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; - return !(contains(matched_nodes, src) || - contains(matched_nodes, dst)); - } - }); - - std::unordered_set post_edges_from_sub = - filter(output_graph_data.edges, - [&](SubParallelComputationGraphEdge const &e) { - return !e.raw_edge.has(); - }); - - bidict - output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( - match, sub.pcg_pattern, spcg); - bidict - output_post_outexpr_mapping = get_output_graph_expr_output_mapping( - output_expr_to_result_sub_pcg_mapping, - sub.output_graph_expr, - substitution_output_graph); - - std::unordered_set incoming_to_sub_edges; - for (auto const &[pattern_input, base_graph_tensor] : - match.input_assignment) { - OutputGraphExprInput output_expr_input = - sub.inputs_mapping.at_l(pattern_input); - input_parallel_tensor_guid_t output_graph_input = - output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( - output_expr_input); - std::unordered_set uses = get_parallel_tensor_uses( - substitution_output_graph, - open_parallel_tensor_guid_from_input(output_graph_input)); - for (parallel_tensor_use_t const &use : uses) { - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_use(base_graph_tensor, use); - incoming_to_sub_edges.insert(new_edge); - } + if (!maybe_isomorphism.has_value()) { + return false; } - std::unordered_set outgoing_from_sub_edges; - for (ParallelComputationGraphEdge const &outgoing_edge : - get_subgraph_outgoing_edges(spcg, matched_nodes)) { - parallel_tensor_guid_t original_tensor = - get_parallel_tensor(outgoing_edge); - PatternNodeOutput pattern_tensor = - output_orig_pattern_mapping.at_r(original_tensor); - OutputGraphExprNodeOutput output_graph_tensor = - sub.outputs_mapping.at_l(pattern_tensor); - parallel_tensor_guid_t new_tensor = - output_post_outexpr_mapping.at_r(output_graph_tensor); - - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_dst( - new_tensor, - get_dst_layer(outgoing_edge), - get_dst_layer_input_idx(outgoing_edge)); - outgoing_from_sub_edges.insert(new_edge); - } + maybe_isomorphism.value(); + }); + + auto l_from_r_pattern_node = [&](PatternNode const &r_node) { + return PatternNode{ + pcg_pattern_isomorphism.node_mapping.at_r(r_node.raw_node), + }; + }; - return set_union(std::vector{ - post_edges_from_orig, 
- post_edges_from_sub, - incoming_to_sub_edges, - outgoing_from_sub_edges, - }); - }(); - - std::unordered_set post_inputs = - pre_data.inputs; - - std::unordered_map - post_value_data = [&] { - std::unordered_map - post_value_data_from_orig = filter_keys( - pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { - return visit_open_parallel_tensor_guid( - t, - overload{ - [&](parallel_tensor_guid_t const &t) { - return contains(post_nodes_from_original_graph, - get_source_layer(t)); - }, - [](input_parallel_tensor_guid_t const &) { - return true; - }, - }); + auto l_from_r_output_attrs_assignment = + [&](OutputOperatorAttrsAssignment const &r_attrs) { + std::optional l_template_operator = + transform(r_attrs.template_operator, l_from_r_pattern_node); + std::unordered_map + l_assignments = map_values( + r_attrs.assignments, + [&](OutputOperatorAttributeExpr const &r_expr) { + return r_expr.visit( + overload{[&](AttrConstant const &) { return r_expr; }, + [&](OutputOperatorAttrAccess const &r_acc) { + return OutputOperatorAttributeExpr{ + OutputOperatorAttrAccess{ + l_from_r_pattern_node(r_acc.node), + r_acc.attr_expr, + }, + }; + }}); }); + return OutputOperatorAttrsAssignment{ + l_template_operator, + l_assignments, + }; + }; + + OpenDataflowGraphIsomorphism output_graph_expr_isomorphism = ({ + std::optional maybe_isomorphism = + find_isomorphism( + l.output_graph_expr.raw_graph, + rewrite_node_labels( + r.output_graph_expr.raw_graph, + [&](Node const &, OutputOperatorAttrsAssignment const &a) { + return l_from_r_output_attrs_assignment(a); + })); + if (!maybe_isomorphism.has_value()) { + return false; + } - std::unordered_map - post_value_data_from_sub = output_graph_data.value_data; - return merge_maps(post_value_data_from_orig, post_value_data_from_sub); - }(); + maybe_isomorphism.value(); + }); - SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ - post_node_data, - post_edges, - post_inputs, - post_value_data, + auto l_from_r_pattern_input = [&](PatternInput const &r_input) { + return PatternInput{ + pcg_pattern_isomorphism.input_mapping.at_r( + r_input.raw_dataflow_graph_input), + }; }; - return sub_pcg_from_graph_data(post_data); + auto l_from_r_output_graph_input = [&](OutputGraphExprInput const &r_input) { + return OutputGraphExprInput{ + output_graph_expr_isomorphism.input_mapping.at_r( + r_input.raw_dataflow_graph_input), + }; + }; + + auto l_from_r_pattern_output = [&](PatternNodeOutput const &r_output) { + return PatternNodeOutput{ + isomorphism_map_l_dataflow_output_from_r(pcg_pattern_isomorphism, + r_output.raw_dataflow_output), + }; + }; + + auto l_from_r_output_graph_output = + [&](OutputGraphExprNodeOutput const &r_output) { + return OutputGraphExprNodeOutput{ + isomorphism_map_l_dataflow_output_from_r( + output_graph_expr_isomorphism, r_output.raw_dataflow_output), + }; + }; + + bidict l_input_mapping_from_r = + transform(r.inputs_mapping, + [&](PatternInput const &r_p, OutputGraphExprInput const &r_o) { + return std::pair{ + l_from_r_pattern_input(r_p), + l_from_r_output_graph_input(r_o), + }; + }); + if (l_input_mapping_from_r != l.inputs_mapping) { + return false; + } + + bidict l_output_mapping_from_r = + transform(r.outputs_mapping, + [&](PatternNodeOutput const &r_p, + OutputGraphExprNodeOutput const &r_o) { + return std::pair{ + l_from_r_pattern_output(r_p), + l_from_r_output_graph_output(r_o), + }; + }); + if (l_output_mapping_from_r != l.outputs_mapping) { + return false; + } + + return true; +} + +bool 
is_valid_substitution(Substitution const &sub) { + { + std::unordered_set pattern_inputs = + get_inputs(sub.pcg_pattern); + std::unordered_set mapped_inputs = + left_entries(sub.inputs_mapping); + + if (pattern_inputs != mapped_inputs) { + return false; + } + } + + { + std::unordered_set output_graph_inputs = + get_inputs(sub.output_graph_expr); + std::unordered_set mapped_inputs = + right_entries(sub.inputs_mapping); + + if (output_graph_inputs != mapped_inputs) { + return false; + } + } + + if (get_nodes(sub.pcg_pattern).empty()) { + return false; + } + + if (get_nodes(sub.output_graph_expr).empty()) { + return false; + } + + return true; } } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/substitution_builder.cc b/lib/substitutions/src/substitutions/substitution_builder.cc new file mode 100644 index 0000000000..a267b8113f --- /dev/null +++ b/lib/substitutions/src/substitutions/substitution_builder.cc @@ -0,0 +1,162 @@ +#include "substitutions/substitution_builder.h" +#include "substitutions/output_graph/output_graph_expr_value.h" +#include "substitutions/substitution.h" +#include "substitutions/unlabelled/pattern_value.h" +#include "utils/containers/repeat_element.h" +#include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" +#include "utils/overload.h" + +namespace FlexFlow { + +SubstitutionBuilder::SubstitutionBuilder() + : pattern_g(LabelledOpenDataflowGraph:: + create>()), + output_g(LabelledOpenDataflowGraph:: + create>()) {} + +std::pair SubstitutionBuilder::add_input( + TensorAttributePattern const &input_tensor_pattern, + std::optional const &name) { + PatternInput pattern_input = PatternInput{ + this->pattern_g.add_input(input_tensor_pattern), + }; + + OutputGraphExprInput output_graph_expr_input = OutputGraphExprInput{ + this->output_g.add_input(std::monostate{}), + }; + + this->input_mapping.equate(pattern_input, output_graph_expr_input); + + if (name.has_value()) { + this->pattern_input_names.equate(pattern_input, name.value()); + } + + return { + PatternValue{pattern_input}, + OutputGraphExprValue{output_graph_expr_input}, + }; +} + +std::vector SubstitutionBuilder::add_pattern_node( + OperatorAttributePattern const &node_pattern, + std::vector const &inputs, + std::vector const &output_patterns, + std::optional const &maybe_name) { + NodeAddedResult node_added = this->pattern_g.add_node( + node_pattern, + transform(inputs, raw_open_dataflow_value_from_pattern_value), + output_patterns); + + if (maybe_name.has_value()) { + std::string name = maybe_name.value(); + + if (this->pattern_node_names.contains_r(name)) { + throw mk_runtime_error(fmt::format("Attempted to name node {}, but a " + "node with that name already exists!", + name)); + } + + this->pattern_node_names.equate(PatternNode{node_added.node}, name); + } + + return transform(node_added.outputs, [](DataflowOutput const &o) { + return pattern_value_from_raw_open_dataflow_value(OpenDataflowValue{o}); + }); +} + +std::vector SubstitutionBuilder::add_output_graph_node( + OutputOperatorAttrsAssignment const &node_expr, + std::vector const &inputs, + nonnegative_int num_outputs) { + NodeAddedResult node_added = this->output_g.add_node( + node_expr, + transform(inputs, raw_open_dataflow_value_from_output_graph_expr_value), + repeat_element(/*num_times=*/num_outputs, /*element=*/std::monostate{})); + + return transform(node_added.outputs, [](DataflowOutput const &o) { + return output_graph_expr_value_from_raw_open_dataflow_value( + OpenDataflowValue{o}); + }); +} + +void 
SubstitutionBuilder::equate_outputs( + PatternValue const &maybe_pattern_output, + OutputGraphExprValue const &maybe_output_graph_expr_output) { + PatternNodeOutput pattern_output = + maybe_pattern_output.visit(overload{ + [](PatternNodeOutput const &o) { return o; }, + [&](PatternInput const &) -> PatternNodeOutput { + throw mk_runtime_error(fmt::format( + "SubstitutionBuilder::equate_outputs expected a PatternValue " + "holding a PatternNodeOutput, but received {}", + maybe_pattern_output)); + }, + }); + + OutputGraphExprNodeOutput output_graph_expr_output = + maybe_output_graph_expr_output.visit(overload{ + [](OutputGraphExprNodeOutput const &o) { return o; }, + [&](OutputGraphExprInput const &) -> OutputGraphExprNodeOutput { + throw mk_runtime_error( + fmt::format("SubstitutionBuilder::equate_outputs expected an " + "OutputGraphExprValue holding a " + "OutputGraphExprNodeOutput, but received {}", + maybe_output_graph_expr_output)); + }, + }); + + if (this->output_mapping.contains_l(pattern_output)) { + throw mk_runtime_error( + fmt::format("SubstitutionBuilder::equate_outputs expected a " + "PatternValue holding a PatternNodeOutput " + "that is not already contained in output_mapping, " + "but received {}", + pattern_output)); + } + if (this->output_mapping.contains_r(output_graph_expr_output)) { + throw mk_runtime_error(fmt::format( + "SubstitutionBuilder::equate_outputs expected an " + "OutputGraphExprValue holding an OutputGraphExprNodeOutput " + "that is not already contained in output_mapping, " + "but received {}", + output_graph_expr_output)); + } + + this->output_mapping.equate(pattern_output, output_graph_expr_output); +} + +PatternNode + SubstitutionBuilder::pattern_node_named(std::string const &name) const { + return this->pattern_node_names.at_r(name); +} + +PatternInput + SubstitutionBuilder::pattern_input_named(std::string const &name) const { + return this->pattern_input_names.at_r(name); +} + +Substitution SubstitutionBuilder::get_substitution() const { + Substitution result = Substitution{ + PCGPattern{this->pattern_g}, + OutputGraphExpr{this->output_g}, + this->input_mapping, + this->output_mapping, + }; + + if (!is_valid_substitution(result)) { + throw mk_runtime_error( + "get_substitution cannot return a Substitution, as the Substitution is " + "currently invalid. 
Ensure you have finished constructing the " + "Substitution and have mapped all of the outputs."); + } + + return result; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc index efbcf4a6f1..7bfb1f5e9e 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc @@ -11,9 +11,8 @@ TensorAttributeValue TensorAttributeValue from_attr = get_attribute(attrs, acc.attribute_key); return from_attr.visit(overload{ - [&](std::vector const &v) -> TensorAttributeValue { - return TensorAttributeValue{ - static_cast(at_idx(v, acc.index).value())}; + [&](std::vector const &v) -> TensorAttributeValue { + return TensorAttributeValue{at_idx(v, acc.index).value()}; }, [](auto &&) -> TensorAttributeValue { throw mk_runtime_error("Invalid operand"); diff --git a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc index d1e97adc37..5acfdf406a 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc @@ -1,5 +1,6 @@ #include "substitutions/tensor_pattern/eval_list_size.h" #include "substitutions/tensor_pattern/get_attribute.h" +#include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" namespace FlexFlow { @@ -9,8 +10,8 @@ TensorAttributeValue eval_list_size(ParallelTensorAttrs const &attrs, TensorAttributeValue from_attr = get_attribute(attrs, acc.attribute_key); return from_attr.visit(overload{ - [](std::vector const &v) -> TensorAttributeValue { - return TensorAttributeValue{v.size()}; + [](std::vector const &v) -> TensorAttributeValue { + return TensorAttributeValue{num_elements(v)}; }, [](auto &&) -> TensorAttributeValue { throw mk_runtime_error("Invalid operand"); diff --git a/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc b/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc index 286bc69b84..3539b06832 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc @@ -10,15 +10,15 @@ TensorAttributeValue get_attribute(ParallelTensorAttrs const &attrs, TensorAttributeKey key) { switch (key) { case TensorAttributeKey::DIM_SIZES: { - std::vector sizes = - transform(vector_of(ff_ordered_shard_dims(attrs.shape.dims)), - [](ShardParallelDim const &d) { return d.size; }); + std::vector sizes = transform( + vector_of(ff_ordered_shard_dims(attrs.shape.dims)), + [](ShardParallelDim const &d) { return nonnegative_int{d.size}; }); return TensorAttributeValue{sizes}; } case TensorAttributeKey::DIM_DEGREES: { - std::vector degrees = transform( + std::vector degrees = transform( vector_of(ff_ordered_shard_dims(attrs.shape.dims)), - [](ShardParallelDim const &d) { return size_t_from_int(d.degree); }); + [](ShardParallelDim const &d) { return nonnegative_int{d.degree}; }); return TensorAttributeValue{degrees}; } default: diff --git a/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc b/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc index 794ab5abda..e1c1fe7cf6 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc +++ 
b/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc @@ -1,4 +1,5 @@ #include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/integer_conversions.h" namespace FlexFlow { @@ -6,4 +7,19 @@ TensorAttributePattern tensor_attribute_pattern_match_all() { return TensorAttributePattern{{}}; } +TensorAttributePattern + tensor_attr_pattern_require_num_dims(nonnegative_int num_dims) { + return TensorAttributePattern{{ + TensorAttributeConstraint{ + ConstraintType::EQUAL, + TensorAttributeExpr{ + TensorAttributeListSize{ + TensorAttributeKey::DIM_SIZES, + }, + }, + TensorAttributeValue{num_dims}, + }, + }}; +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/unity_substitution_set.cc b/lib/substitutions/src/substitutions/unity_substitution_set.cc new file mode 100644 index 0000000000..4b00cdd95f --- /dev/null +++ b/lib/substitutions/src/substitutions/unity_substitution_set.cc @@ -0,0 +1,235 @@ +#include "substitutions/unity_substitution_set.h" +#include "pcg/machine_specification.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/substitution_builder.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/nonnegative_int/nonnegative_range.h" + +namespace FlexFlow { + +std::vector + get_substitution_set(MachineSpecification const &resources) { + std::vector substitutions; + for (nonnegative_int num_dims : + nonnegative_range(1_n, nonnegative_int{MAX_TENSOR_DIM})) { + for (nonnegative_int degree = 1_n; degree <= get_num_gpus(resources); + degree *= 2_n) { + substitutions.push_back( + create_replicate_linear_combine(num_dims, degree, true)); + substitutions.push_back( + create_replicate_linear_combine(num_dims, degree, false)); + } + } + substitutions.push_back(create_fuse_linear_activation(Activation::RELU)); + substitutions.push_back(create_fuse_linear_activation(Activation::SIGMOID)); + substitutions.push_back(create_fuse_linear_activation(Activation::TANH)); + substitutions.push_back(create_fuse_linear_activation(Activation::GELU)); + return substitutions; +} + +Substitution create_combine_inception(nonnegative_int num_convs, + nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_combine_concat(nonnegative_int num_inputs, + nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_replicate_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + bool use_bias) { + SubstitutionBuilder b; + + auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = b.add_input(tensor_attribute_pattern_match_all()); + std::vector p_inputs = {p_input, p_weight}; + + std::optional o_bias = std::nullopt; + if (use_bias) { + std::pair bias = + b.add_input(tensor_attribute_pattern_match_all()); + p_inputs.push_back(bias.first); + o_bias = bias.second; + } + + OperatorAttributePattern linear_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals(OperatorAttributeKey::BIAS, + OperatorAttributeValue{use_bias}), + op_attr_key_divisible_by(OperatorAttributeKey::OUT_CHANNELS, + nonnegative_int{degree}), + }}; + + PatternValue p_linear_output = get_only(b.add_pattern_node( + linear_pattern, + p_inputs, 
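+      // one tensor pattern per node output: the single linear output must
+      // have exactly num_dims shard dims; the node is named "linear" so the
+      // output graph below can clone its attrs via
+      // b.pattern_node_named("linear")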
+ {tensor_attr_pattern_require_num_dims(nonnegative_int{num_dims})}, + "linear")); + + OutputOperatorAttrsAssignment replicate_input_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPLICATE), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + }}; + OutputGraphExprValue o_replicate_input_output = + get_only(b.add_output_graph_node(replicate_input_expr, {o_input}, 1_n)); + + OutputOperatorAttrsAssignment partition_weights_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPARTITION), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{1_n}}), + }}; + OutputGraphExprValue o_partition_weights_output = get_only( + b.add_output_graph_node(partition_weights_expr, {o_weight}, 1_n)); + + std::vector o_linear_inputs = { + o_replicate_input_output, o_partition_weights_output}; + + if (use_bias) { + OutputOperatorAttrsAssignment partition_bias_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPARTITION), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{1_n}}), + }}; + OutputGraphExprValue o_partition_bias_output = get_only( + b.add_output_graph_node(partition_bias_expr, {o_bias.value()}, 1_n)); + o_linear_inputs.push_back(o_partition_bias_output); + } + + OutputOperatorAttrsAssignment linear_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("linear"), + {}, + }; + OutputGraphExprValue o_linear_output = + get_only(b.add_output_graph_node(linear_expr, o_linear_inputs, 1_n)); + + OutputOperatorAttrsAssignment combine_expr = OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::COMBINE), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant( + OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{ + nonnegative_int{num_dims.unwrap_nonnegative() - 1}, + }}), + }, + }; + OutputGraphExprValue o_combine_output = + get_only(b.add_output_graph_node(combine_expr, {o_linear_output}, 1_n)); + + b.equate_outputs(p_linear_output, o_combine_output); + + return b.get_substitution(); +} + +Substitution create_partition_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + Activation activation, + bool use_bias) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_conv2d_combine(nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_attention_combine(nonnegative_int num_heads, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_replicate_attention_reduce(nonnegative_int num_heads, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_add_combine(ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_relu_combine(ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_concat_combine(nonnegative_int num_inputs, + ff_dim_t concat_dim, + ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_softmax_combine(ff_dim_t softmax_dim, + ff_dim_t partition_dim, + 
nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_fuse_linear_activation(Activation activation) { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all(), "input"); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all(), "weight"); + + OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + PatternValue p_mm_output = + get_only(b.add_pattern_node(mm_pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + + OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + PatternValue p_relu_output = + get_only(b.add_pattern_node(relu_pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + + OutputOperatorAttrsAssignment fused_node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{activation}), + }}; + OutputGraphExprValue o_fused_node_output = get_only( + b.add_output_graph_node(fused_node_expr, {o_input, o_weight}, 1_n)); + + b.equate_outputs(p_relu_output, o_fused_node_output); + + return b.get_substitution(); +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc b/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc index e8deacebec..dff600ecf0 100644 --- a/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc +++ b/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc @@ -11,7 +11,7 @@ PatternNode get_dst_node(InputPatternEdge const &e) { return PatternNode{e.raw_edge.dst.node}; } -int get_dst_idx(InputPatternEdge const &e) { +nonnegative_int get_dst_idx(InputPatternEdge const &e) { return e.raw_edge.dst.idx; } diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc index 9abdc4e83c..24bbb6f4d1 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc @@ -6,7 +6,7 @@ PatternNode get_src_node(PatternNodeOutput const &o) { return PatternNode{o.raw_dataflow_output.node}; } -int get_idx(PatternNodeOutput const &o) { +nonnegative_int get_idx(PatternNodeOutput const &o) { return o.raw_dataflow_output.idx; } diff --git a/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc b/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc index dea3e5f500..17d05f1122 100644 --- a/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc +++ b/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc @@ -10,11 +10,11 @@ PatternNode get_dst_node(StandardPatternEdge const &e) { return PatternNode{e.raw_edge.dst.node}; } -int get_src_idx(StandardPatternEdge const &e) { +nonnegative_int get_src_idx(StandardPatternEdge const &e) { return e.raw_edge.src.idx; } -int get_dst_idx(StandardPatternEdge const &e) { +nonnegative_int get_dst_idx(StandardPatternEdge const &e) { return e.raw_edge.dst.idx; } diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc 
b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc new file mode 100644 index 0000000000..5fd923f71f --- /dev/null +++ b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc @@ -0,0 +1,174 @@ +#include "substitutions/apply_substitution/apply_substitution.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution_builder.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include "utils/integer_conversions.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("apply_substitution") { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all(), "input"); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all(), "weight"); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + }(); + + PatternValue p_relu_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + }(); + + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only( + b.add_output_graph_node(node_expr, {o_input, o_weight}, 1_n)); + }(); + + b.equate_outputs(p_relu_output, o_fused_output); + + Substitution sub = b.get_substitution(); + + nonnegative_int in_channels = 24_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; + std::string mm_match = "mm_match"; + std::string relu_match = "relu_match"; + + SubParallelComputationGraph pcg = [&] { + ParallelComputationGraphBuilder b; + parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, + }, + ReplicaParallelDimSet{ + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + }, + }, + DataType::FLOAT, + }); + t = b.dense(t, + /*outDim=*/16_n, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12_n, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/mm_match); + t = b.relu(t, + /*name=*/relu_match); + t = b.dense(t, + /*outDim=*/8_n, + /*activation=*/Activation::RELU); + + return sub_pcg_from_full_pcg(b.pcg); + }(); + + PCGPatternMatch match = [&] { + parallel_layer_guid_t mm_match_layer = + get_parallel_layer_by_name(pcg, mm_match); + parallel_layer_guid_t relu_match_layer = + get_parallel_layer_by_name(pcg, relu_match); + open_parallel_tensor_guid_t mm_match_layer_input_activations = + get_layer_inputs(pcg, 
mm_match_layer).at(0); + open_parallel_tensor_guid_t mm_match_layer_input_weights = + get_layer_inputs(pcg, mm_match_layer).at(1); + + return PCGPatternMatch{ + bidict{ + {b.pattern_node_named("mm"), mm_match_layer}, + {b.pattern_node_named("relu"), relu_match_layer}, + }, + std::unordered_map{ + { + b.pattern_input_named("input"), + mm_match_layer_input_activations, + }, + { + b.pattern_input_named("weight"), + mm_match_layer_input_weights, + }}, + }; + }(); + + SubParallelComputationGraph result = apply_substitution(pcg, sub, match); + + SubParallelComputationGraph correct = [&] { + ParallelComputationGraphBuilder b; + parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, + }, + ReplicaParallelDimSet{ + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + }, + }, + DataType::FLOAT, + }); + t = b.dense(t, + /*outDim=*/16_n, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12_n, + /*activation=*/Activation::RELU, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/std::nullopt); + t = b.dense(t, + /*outDim=*/8_n, + /*activation=*/Activation::RELU); + + return sub_pcg_from_full_pcg(b.pcg); + }(); + + // since the new nodes produced by the substitution have new ids, it's + // easier/more correct to check that the graphs are isomorphic rather than + // checking their exact graph data + CHECK(sub_pcgs_are_isomorphic(result, correct)); + } +} diff --git a/lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc similarity index 86% rename from lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc rename to lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc index 52b54b32fb..7bdcc5a3bd 100644 --- a/lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "substitutions/open_parallel_tensor_guid_t.h" #include "substitutions/operator_pattern/operator_attribute_constraint.h" @@ -64,20 +64,23 @@ TEST_SUITE(FF_TEST_SUITE) { OutputGraphExprInput{output_g.add_input({})}; OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = - OutputOperatorAttrsAssignment{{ - set_attr_to_constant(OperatorAttributeKey::OP_TYPE, - OperatorAttributeValue{OperatorType::LINEAR}), - copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, - pattern_mm_node), - set_attr_to_constant(OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{Activation::RELU}), - copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, - pattern_mm_node), - }}; + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_attr_to_constant( + OperatorAttributeKey::OP_TYPE, + OperatorAttributeValue{OperatorType::LINEAR}), + 
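+              // a hand-built attrs assignment (std::nullopt above leaves
+              // template_operator unset): OP_TYPE and ACTIVATION are pinned
+              // as constants, while the remaining fields are copied from the
+              // matched pattern node below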
copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, + pattern_mm_node), + copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, + pattern_mm_node), + copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, + pattern_mm_node), + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, + pattern_mm_node), + }}; NodeAddedResult fused_mm_relu_added = output_g.add_node( fused_mm_relu_attrs_assignment, {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, @@ -108,9 +111,9 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - int in_channels = 24; - int batch_size = 4; - int batch_degree = 2; + nonnegative_int in_channels = 24_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; std::string mm_match = "mm_match"; std::string relu_match = "relu_match"; @@ -119,22 +122,22 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }); t = b.dense(t, - /*outDim=*/16, + /*outDim=*/16_n, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12, + /*outDim=*/12_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -144,7 +147,7 @@ TEST_SUITE(FF_TEST_SUITE) { t = b.relu(t, /*name=*/relu_match); t = b.dense(t, - /*outDim=*/8, + /*outDim=*/8_n, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); @@ -186,10 +189,10 @@ TEST_SUITE(FF_TEST_SUITE) { result_input_map = result.second.input_mapping; LinearAttrs correct_result_fused_mm_relu_attrs = LinearAttrs{ - 12, + /*out_channels=*/12_n, /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, + /*data_type=*/DataType::FLOAT, + /*activation=*/Activation::RELU, /*regularizer=*/std::nullopt, }; @@ -228,7 +231,7 @@ TEST_SUITE(FF_TEST_SUITE) { result_i_activation.raw_dataflow_graph_input, DataflowInput{ result_fused_mm_relu_node.raw_graph_node, - 0, + 0_n, }, }, }, @@ -239,7 +242,7 @@ TEST_SUITE(FF_TEST_SUITE) { result_i_weights.raw_dataflow_graph_input, DataflowInput{ result_fused_mm_relu_node.raw_graph_node, - 1, + 1_n, }, }, }, diff --git a/lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc similarity index 78% rename from lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc rename to lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc index 4d4e557fb8..950e833771 100644 --- a/lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/perform_shape_inference.h" +#include "substitutions/apply_substitution/perform_shape_inference.h" #include "op-attrs/ops/element_unary.h" #include "op-attrs/ops/linear.h" #include "op-attrs/parallel_tensor_shape.h" @@ -18,21 +18,21 @@ TEST_SUITE(FF_TEST_SUITE) { UnorderedSetLabelledOpenDataflowGraph>(); - int in_channels = 24; - int 
out_channels = 16; - int batch_size = 4; - int batch_degree = 2; + nonnegative_int in_channels = 24_n; + nonnegative_int out_channels = 16_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; DataflowGraphInput i0 = g.add_input({}); ParallelTensorShape i0_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -40,28 +40,28 @@ TEST_SUITE(FF_TEST_SUITE) { bool use_bias = false; LinearAttrs n1_op_attrs = LinearAttrs{ - out_channels, - use_bias, - DataType::FLOAT, - std::nullopt, - std::nullopt, + /*out_channels=*/out_channels, + /*use_bias=*/use_bias, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, }; ParallelLayerAttrs n1_attrs = ParallelLayerAttrs{ - PCGOperatorAttrs{ + /*op_attrs=*/PCGOperatorAttrs{ n1_op_attrs, }, - std::nullopt, + /*name=*/std::nullopt, }; ElementUnaryAttrs n2_op_attrs = ElementUnaryAttrs{ - OperatorType::RELU, - std::nullopt, + /*op_type=*/OperatorType::RELU, + /*scalar=*/std::nullopt, }; ParallelLayerAttrs n2_attrs = ParallelLayerAttrs{ - PCGOperatorAttrs{ + /*op_attrs=*/PCGOperatorAttrs{ n2_op_attrs, }, - std::nullopt, + /*name=*/std::nullopt, }; ParallelTensorShape n1_output_shape = @@ -131,22 +131,22 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowEdge{ DataflowInputEdge{ i0, - DataflowInput{n1, 0}, + DataflowInput{n1, 0_n}, }, }, OpenDataflowEdge{DataflowEdge{ - DataflowOutput{n1_weight_node, 0}, - DataflowInput{n1_weight_replicate_node, 0}, + DataflowOutput{n1_weight_node, 0_n}, + DataflowInput{n1_weight_replicate_node, 0_n}, }}, OpenDataflowEdge{ DataflowEdge{ - DataflowOutput{n1_weight_replicate_node, 0}, - DataflowInput{n1, 1}, + DataflowOutput{n1_weight_replicate_node, 0_n}, + DataflowInput{n1, 1_n}, }, }, OpenDataflowEdge{DataflowEdge{ - DataflowOutput{n1, 0}, - DataflowInput{n2, 0}, + DataflowOutput{n1, 0_n}, + DataflowInput{n2, 0_n}, }}, }, {i0}, @@ -155,19 +155,20 @@ TEST_SUITE(FF_TEST_SUITE) { i0_shape, }, { - OpenDataflowValue{DataflowOutput{n1_weight_node, 0}}, + OpenDataflowValue{DataflowOutput{n1_weight_node, 0_n}}, lift_to_parallel(get_reduced_shape(n1_weight_shape)), }, { - OpenDataflowValue{DataflowOutput{n1_weight_replicate_node, 0}}, + OpenDataflowValue{ + DataflowOutput{n1_weight_replicate_node, 0_n}}, n1_weight_shape, }, { - OpenDataflowValue{DataflowOutput{n1, 0}}, + OpenDataflowValue{DataflowOutput{n1, 0_n}}, n1_output_shape, }, { - OpenDataflowValue{DataflowOutput{n2, 0}}, + OpenDataflowValue{DataflowOutput{n2, 0_n}}, n2_output_shape, }}}; diff --git a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc index 95b61e0ef4..24f9e9bd56 100644 --- a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc +++ b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attribute(LinearAttrs, OperatorAttributeKey)") { - int out_channels = 16; + nonnegative_int out_channels = 16_n; bool use_bias = true; std::optional activation = Activation::GELU; std::optional regularizer = RegularizerAttrs{ diff --git 
a/lib/substitutions/test/src/substitutions/pcg_pattern.cc b/lib/substitutions/test/src/substitutions/pcg_pattern.cc index d9273b4bcf..9ff368a8eb 100644 --- a/lib/substitutions/test/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/test/src/substitutions/pcg_pattern.cc @@ -15,19 +15,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_pattern_matches(PCGPattern, SubParallelComputationGraph)") { ParallelComputationGraphBuilder builder; - size_t batch_size = 16; - int batch_degree = 2; - size_t num_channels = 24; + nonnegative_int batch_size = 16_n; + nonnegative_int batch_degree = 2_n; + nonnegative_int num_channels = 24_n; ParallelTensorShape a_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ ShardParallelDim{batch_size, batch_degree}, - ShardParallelDim{num_channels, 1}, + ShardParallelDim{num_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -37,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape, CreateGrad::YES, a_name); - int outDim = 16; + nonnegative_int outDim = 16_n; std::string x_matmul_name = "x_matmul"; std::string y_matmul_name = "y_matmul"; parallel_tensor_guid_t t0 = diff --git a/lib/substitutions/test/src/substitutions/substitution.cc b/lib/substitutions/test/src/substitutions/substitution.cc index 1718b03b5c..ef27cb7606 100644 --- a/lib/substitutions/test/src/substitutions/substitution.cc +++ b/lib/substitutions/test/src/substitutions/substitution.cc @@ -4,226 +4,173 @@ #include "substitutions/operator_pattern/operator_attribute_constraint.h" #include "substitutions/output_graph/output_graph_expr_node.dtg.h" #include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_builder.h" #include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution_builder.h" #include "substitutions/tensor_pattern/tensor_attribute_pattern.h" #include "utils/containers/get_only.h" #include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" #include "utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h" +#include "utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h" #include "utils/integer_conversions.h" #include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - // TEST_CASE("is_valid_substitution") { - // FAIL("TODO"); - // } - - TEST_CASE("evaluate_substitution_output(SubParallelComputationGraph, " - "Substitution, PCGPatternMatch)") { - // Currently Substitution creation is very verbose. - // This is being addressed in - // https://github.com/flexflow/FlexFlow/issues/1473. 
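The hunk below swaps the hand-rolled graph construction (the removed lines that follow) for the SubstitutionBuilder flow that the rest of this patch standardizes on. As a reading aid, here is a condensed sketch of that flow, distilled from the calls visible in these tests; the ACTIVATION constraint and input names are omitted for brevity, so treat this as an illustration of the builder pattern rather than an authoritative API reference:

    SubstitutionBuilder b;

    // Pattern side: two inputs feed a LINEAR node whose output feeds RELU.
    auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all());
    auto [p_weight, o_weight] = b.add_input(tensor_attribute_pattern_match_all());

    PatternValue p_mm = get_only(b.add_pattern_node(
        OperatorAttributePattern{{
            op_type_equals_constraint(OperatorType::LINEAR),
        }},
        {p_input, p_weight},
        {tensor_attribute_pattern_match_all()},
        "mm"));
    PatternValue p_relu = get_only(b.add_pattern_node(
        OperatorAttributePattern{{
            op_type_equals_constraint(OperatorType::RELU),
        }},
        {p_mm},
        {tensor_attribute_pattern_match_all()},
        "relu"));

    // Output side: one fused node inheriting its attributes from the pattern
    // node named "mm", with ACTIVATION overridden to RELU.
    OutputGraphExprValue o_fused = get_only(b.add_output_graph_node(
        OutputOperatorAttrsAssignment{
            b.pattern_node_named("mm"),
            {set_attr_to_constant(OperatorAttributeKey::ACTIVATION,
                                  OperatorAttributeValue{Activation::RELU})},
        },
        {o_input, o_weight},
        1_n));

    // Tie the pattern's final output to the output graph's output.
    b.equate_outputs(p_relu, o_fused);
    Substitution sub = b.get_substitution();

The builder hides the LabelledOpenDataflowGraph plumbing and the two input/output bidicts that the removed code wires up by hand, which is the verbosity the comment above refers to.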
- auto pattern_g = LabelledOpenDataflowGraph:: - create>(); - - PatternInput pattern_i_activation = - PatternInput{pattern_g.add_input(tensor_attribute_pattern_match_all())}; - PatternInput pattern_i_weights = - PatternInput{pattern_g.add_input(tensor_attribute_pattern_match_all())}; - - OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ - op_type_equals_constraint(OperatorType::LINEAR), - op_attr_key_equals( - OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{std::optional{std::nullopt}}), - }}; - NodeAddedResult mm_added = pattern_g.add_node( - mm_pattern, - {OpenDataflowValue{pattern_i_activation.raw_dataflow_graph_input}, - OpenDataflowValue{pattern_i_weights.raw_dataflow_graph_input}}, - {tensor_attribute_pattern_match_all()}); - PatternNode pattern_mm_node = PatternNode{mm_added.node}; - DataflowOutput mm_output = get_only(mm_added.outputs); - - OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ - op_type_equals_constraint(OperatorType::RELU), - }}; - NodeAddedResult relu_added = - pattern_g.add_node(relu_pattern, - {OpenDataflowValue{mm_output}}, - {tensor_attribute_pattern_match_all()}); - PatternNode pattern_relu_node = PatternNode{relu_added.node}; - DataflowOutput relu_output = get_only(relu_added.outputs); - - LabelledOpenDataflowGraph - output_g = LabelledOpenDataflowGraph:: - create>(); - - OutputGraphExprInput output_i_activation = - OutputGraphExprInput{output_g.add_input({})}; - OutputGraphExprInput output_i_weights = - OutputGraphExprInput{output_g.add_input({})}; - - OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = - OutputOperatorAttrsAssignment{{ - set_attr_to_constant(OperatorAttributeKey::OP_TYPE, - OperatorAttributeValue{OperatorType::LINEAR}), - copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, - pattern_mm_node), - set_attr_to_constant(OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{Activation::RELU}), - copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, - pattern_mm_node), + TEST_CASE("is_isomorphic_to(Substitution, Substitution)") { + auto make_substitution = [] { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{ + std::optional{std::nullopt}}), }}; - NodeAddedResult fused_mm_relu_added = output_g.add_node( - fused_mm_relu_attrs_assignment, - {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, - OpenDataflowValue{output_i_weights.raw_dataflow_graph_input}}, - {{}}); - OutputGraphExprNode fused_mm_relu_node = - OutputGraphExprNode{fused_mm_relu_added.node}; - DataflowOutput fused_mm_relu_output = get_only(fused_mm_relu_added.outputs); - - Substitution sub = Substitution{ - PCGPattern{pattern_g}, - OutputGraphExpr{output_g}, - bidict{ - { - pattern_i_activation, - output_i_activation, - }, - { - pattern_i_weights, - output_i_weights, - }, - }, - bidict{ + + return get_only( + b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + }(); + + PatternValue p_relu_output = [&] { + auto pattern = 
OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only( + b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + }(); + + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), { - PatternNodeOutput{relu_output}, - OutputGraphExprNodeOutput{fused_mm_relu_output}, - }, - }, + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only(b.add_output_graph_node( + node_expr, {o_input, o_weight}, nonnegative_int{1})); + }(); + + b.equate_outputs(p_relu_output, o_fused_output); + + return b.get_substitution(); }; - int in_channels = 24; - int batch_size = 4; - int batch_degree = 2; - std::string mm_match = "mm_match"; - std::string relu_match = "relu_match"; - - SubParallelComputationGraph pcg = [&] { - ParallelComputationGraphBuilder b; - parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ - ParallelTensorDims{ - FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, - }, - ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, - }, - }, - DataType::FLOAT, - }); - t = b.dense(t, - /*outDim=*/16, - /*activation=*/std::nullopt); - t = b.gelu(t); - t = b.dense(t, - /*outDim=*/12, - /*activation=*/std::nullopt, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - /*name=*/mm_match); - t = b.relu(t, - /*name=*/relu_match); - t = b.dense(t, - /*outDim=*/8, - /*activation=*/Activation::RELU); - - return sub_pcg_from_full_pcg(b.pcg); + Substitution sub1 = make_substitution(); + Substitution sub2 = make_substitution(); + + CHECK(is_isomorphic_to(sub1, sub1)); + CHECK(is_isomorphic_to(sub1, sub2)); + } + + TEST_CASE("is_valid_substitution") { + SubstitutionBuilder b; + + auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); }(); - PCGPatternMatch match = [&] { - parallel_layer_guid_t mm_match_layer = - get_parallel_layer_by_name(pcg, mm_match); - parallel_layer_guid_t relu_match_layer = - get_parallel_layer_by_name(pcg, relu_match); - open_parallel_tensor_guid_t mm_match_layer_input_activations = - get_layer_inputs(pcg, mm_match_layer).at(0); - open_parallel_tensor_guid_t mm_match_layer_input_weights = - get_layer_inputs(pcg, mm_match_layer).at(1); - - return PCGPatternMatch{ - bidict{ - {pattern_mm_node, mm_match_layer}, - {pattern_relu_node, relu_match_layer}, - }, - std::unordered_map{ - { - PatternInput{pattern_i_activation}, - mm_match_layer_input_activations, - }, - { - PatternInput{pattern_i_weights}, - mm_match_layer_input_weights, - }}, - }; + PatternValue p_relu_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); }(); - SubParallelComputationGraph 
result = apply_substitution(pcg, sub, match); - - SubParallelComputationGraph correct = [&] { - ParallelComputationGraphBuilder b; - parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ - ParallelTensorDims{ - FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, - }, - ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, - }, - }, - DataType::FLOAT, - }); - t = b.dense(t, - /*outDim=*/16, - /*activation=*/std::nullopt); - t = b.gelu(t); - t = b.dense(t, - /*outDim=*/12, - /*activation=*/Activation::RELU, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - /*name=*/std::nullopt); - t = b.dense(t, - /*outDim=*/8, - /*activation=*/Activation::RELU); - - return sub_pcg_from_full_pcg(b.pcg); + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only(b.add_output_graph_node( + node_expr, {o_input, o_weight}, nonnegative_int{1})); }(); - // since the new nodes produced by the substitution have new ids, it's - // easier/more correct to check that the graphs are isomorphic rather than - // checking their exact graph data - CHECK(sub_pcgs_are_isomorphic(result, correct)); + b.equate_outputs(p_relu_output, o_fused_output); + + SUBCASE("pattern inputs != mapped inputs") { + Substitution sub = b.get_substitution(); + sub.pcg_pattern.raw_graph.add_input(tensor_attribute_pattern_match_all()); + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("output graph inputs != mapped inputs") { + Substitution sub = b.get_substitution(); + sub.output_graph_expr.raw_graph.add_input(std::monostate{}); + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("pattern has no nodes") { + // Could revamp this test to only trigger the + // get_nodes(sub.pcg_pattern).empty() case + Substitution sub = b.get_substitution(); + LabelledOpenDataflowGraph + zero_node_pattern = + LabelledOpenDataflowGraph:: + create>(); + sub.pcg_pattern = PCGPattern{zero_node_pattern}; + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("output graph has no nodes") { + // Could revamp this test to only trigger the + // get_nodes(sub.output_graph_expr).empty() case + Substitution sub = b.get_substitution(); + LabelledOpenDataflowGraph + zero_node_pattern = + LabelledOpenDataflowGraph:: + create>(); + sub.output_graph_expr = OutputGraphExpr{zero_node_pattern}; + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("valid substitution") { + Substitution sub = b.get_substitution(); + CHECK(is_valid_substitution(sub)); + } } } diff --git a/lib/substitutions/test/src/substitutions/substitution_builder.cc b/lib/substitutions/test/src/substitutions/substitution_builder.cc new file mode 100644 index 0000000000..028a4e59c9 --- /dev/null +++ b/lib/substitutions/test/src/substitutions/substitution_builder.cc @@ -0,0 +1,145 @@ +#include "substitutions/substitution_builder.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_graph_expr_node.dtg.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/substitution.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include 
"utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("SubstitutionBuilder") { + OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + std::unordered_map + fused_mm_relu_attr_assignments = { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }; + + Substitution correct = [&] { + auto pattern_g = LabelledOpenDataflowGraph:: + create< + UnorderedSetLabelledOpenDataflowGraph>(); + + PatternInput pattern_i_activation = PatternInput{ + pattern_g.add_input(tensor_attribute_pattern_match_all())}; + PatternInput pattern_i_weights = PatternInput{ + pattern_g.add_input(tensor_attribute_pattern_match_all())}; + + NodeAddedResult mm_added = pattern_g.add_node( + mm_pattern, + {OpenDataflowValue{pattern_i_activation.raw_dataflow_graph_input}, + OpenDataflowValue{pattern_i_weights.raw_dataflow_graph_input}}, + {tensor_attribute_pattern_match_all()}); + PatternNode pattern_mm_node = PatternNode{mm_added.node}; + DataflowOutput mm_output = get_only(mm_added.outputs); + + NodeAddedResult relu_added = + pattern_g.add_node(relu_pattern, + {OpenDataflowValue{mm_output}}, + {tensor_attribute_pattern_match_all()}); + PatternNode pattern_relu_node = PatternNode{relu_added.node}; + DataflowOutput relu_output = get_only(relu_added.outputs); + + LabelledOpenDataflowGraph + output_g = LabelledOpenDataflowGraph:: + create>(); + + OutputGraphExprInput output_i_activation = + OutputGraphExprInput{output_g.add_input({})}; + OutputGraphExprInput output_i_weights = + OutputGraphExprInput{output_g.add_input({})}; + + OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = + OutputOperatorAttrsAssignment{ + pattern_mm_node, + fused_mm_relu_attr_assignments, + }; + NodeAddedResult fused_mm_relu_added = output_g.add_node( + fused_mm_relu_attrs_assignment, + {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, + OpenDataflowValue{output_i_weights.raw_dataflow_graph_input}}, + {{}}); + OutputGraphExprNode fused_mm_relu_node = + OutputGraphExprNode{fused_mm_relu_added.node}; + DataflowOutput fused_mm_relu_output = + get_only(fused_mm_relu_added.outputs); + + return Substitution{ + PCGPattern{pattern_g}, + OutputGraphExpr{output_g}, + bidict{ + { + pattern_i_activation, + output_i_activation, + }, + { + pattern_i_weights, + output_i_weights, + }, + }, + bidict{ + { + PatternNodeOutput{relu_output}, + OutputGraphExprNodeOutput{fused_mm_relu_output}, + }, + }, + }; + }(); + + Substitution result = [&] { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = + get_only(b.add_pattern_node(mm_pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + + PatternValue p_relu_output = + get_only(b.add_pattern_node(relu_pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + + OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = + OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + fused_mm_relu_attr_assignments, + }; + 
OutputGraphExprValue o_fused_output = + get_only(b.add_output_graph_node(fused_mm_relu_attrs_assignment, + {o_input, o_weight}, + nonnegative_int{1})); + + b.equate_outputs(p_relu_output, o_fused_output); + + return b.get_substitution(); + }(); + + CHECK(is_isomorphic_to(result, correct)); + } +} diff --git a/lib/substitutions/test/src/substitutions/unity_substitution_set.cc b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc new file mode 100644 index 0000000000..804fa99bef --- /dev/null +++ b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc @@ -0,0 +1,20 @@ +#include "substitutions/unity_substitution_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_substitution_set") { + MachineSpecification machine_spec = MachineSpecification{ + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/8_n, + /*num_gpus_per_node=*/4_n, + /*inter_node_bandwidth=*/0.0, + /*intra_node_bandwidth=*/0.0, + }; + + std::vector result = get_substitution_set(machine_spec); + + CHECK(result.size() == 36); + } +} diff --git a/lib/substitutions/test/src/test_pattern_matches.cc b/lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc similarity index 94% rename from lib/substitutions/test/src/test_pattern_matches.cc rename to lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc index aeedd65f82..ab79ad6ff6 100644 --- a/lib/substitutions/test/src/test_pattern_matches.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc @@ -9,7 +9,6 @@ #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_inputs.h" #include "utils/graph/open_dataflow_graph/open_dataflow_graph.h" -#include "utils/overload.h" #include using namespace FlexFlow; @@ -59,30 +58,30 @@ namespace rc { // OpenMultiDiGraphView subgraph = // get_subgraph(as_openmultidigraph(g), // subgraph_nodes); - +// // std::vector matches = // find_pattern_matches(subgraph, as_openmultidigraph(g), AlwaysTrue{}); - +// // RC_ASSERT(!matches.empty()); - +// // for (MultiDiGraphPatternMatch const &match : matches) { // RC_ASSERT(pattern_matches(subgraph, as_openmultidigraph(g), match, // AlwaysTrue{})); // } // }); -// } TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("find_pattern_matches_small") { + TEST_CASE("find_pattern_matches") { OpenDataflowGraph pattern_graph = OpenDataflowGraph::create(); - NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1); + NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1_n); Node pattern_n0 = pattern_n0_added.node; OpenDataflowValue pattern_v0 = OpenDataflowValue{get_only(pattern_n0_added.outputs)}; - NodeAddedResult pattern_n1_added = pattern_graph.add_node({pattern_v0}, 1); + NodeAddedResult pattern_n1_added = + pattern_graph.add_node({pattern_v0}, 1_n); Node pattern_n1 = pattern_n1_added.node; OpenDataflowValue pattern_v1 = OpenDataflowValue{get_only(pattern_n1_added.outputs)}; @@ -94,19 +93,19 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph graph = OpenDataflowGraph::create(); - NodeAddedResult n0_added = graph.add_node({}, 1); + NodeAddedResult n0_added = graph.add_node({}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = graph.add_node({v0}, 1); + NodeAddedResult n1_added = graph.add_node({v0}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; - NodeAddedResult n2_added = 
graph.add_node({v1}, 1); + NodeAddedResult n2_added = graph.add_node({v1}, 1_n); Node n2 = n2_added.node; OpenDataflowValue v2 = OpenDataflowValue{get_only(n2_added.outputs)}; - NodeAddedResult n3_added = graph.add_node({v2}, 1); + NodeAddedResult n3_added = graph.add_node({v2}, 1_n); Node n3 = n3_added.node; OpenDataflowValue v3 = OpenDataflowValue{get_only(n3_added.outputs)}; @@ -128,8 +127,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector n1_incoming = {OpenDataflowEdge{ DataflowEdge{ - DataflowOutput{n0, 0}, - DataflowInput{n1, 0}, + DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}, }, }}; @@ -201,7 +200,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph::create(); DataflowGraphInput i0 = g.add_input(); - NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node g_n0 = g_n0_added.node; OpenDataflowValue g_v0 = OpenDataflowValue{get_only(g_n0_added.outputs)}; PatternNode g_p0 = PatternNode{g_n0}; diff --git a/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc new file mode 100644 index 0000000000..8fd468d186 --- /dev/null +++ b/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc @@ -0,0 +1,210 @@ +#include "substitutions/unlabelled/pattern_matching.h" +#include "substitutions/unlabelled/find_pattern_matches.h" +#include "substitutions/unlabelled/match_additional_criterion.h" +#include "utils/containers/get_only.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_inputs.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_graph.h" +#include "utils/overload.h" +#include + +using namespace FlexFlow; + +namespace rc { + +// template <> +// struct Arbitrary { +// static int const MAX_GRAPH_SIZE = 200; +// static int const MAX_EDGE_SIZE = 1000; +// +// static Gen arbitrary() { +// return gen::exec([&] { +// int num_nodes = *gen::inRange(1, MAX_GRAPH_SIZE + 1); +// MultiDiGraph g = MultiDiGraph::template +// create(); +// +// std::vector nodes; +// for (int i = 0; i < num_nodes; ++i) { +// nodes.push_back(g.add_node()); +// } +// +// int num_edges = *gen::inRange(1, MAX_GRAPH_SIZE + 1); +// for (int i = 0; i < num_edges; ++i) { +// int src_id = *gen::inRange(0, num_nodes); +// int dst_id = *gen::inRange(0, num_nodes); +// if (src_id > dst_id) { +// std::swap(src_id, dst_id); +// } +// +// g.add_edge(MultiDiEdge{nodes[dst_id], +// g.add_node_port(), +// nodes[src_id], +// g.add_node_port()}); +// } +// +// return g; +// }); +// } +// }; + +} // namespace rc + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("find_pattern_matches") { + OpenDataflowGraph pattern_graph = + OpenDataflowGraph::create(); + + NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1_n); + Node pattern_n0 = pattern_n0_added.node; + OpenDataflowValue pattern_v0 = + OpenDataflowValue{get_only(pattern_n0_added.outputs)}; + + NodeAddedResult pattern_n1_added = + pattern_graph.add_node({pattern_v0}, 1_n); + Node pattern_n1 = pattern_n1_added.node; + OpenDataflowValue pattern_v1 = + OpenDataflowValue{get_only(pattern_n1_added.outputs)}; + + UnlabelledGraphPattern pattern = 
UnlabelledGraphPattern{pattern_graph}; + PatternNode p0 = PatternNode{pattern_n0}; + PatternNode p1 = PatternNode{pattern_n1}; + + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + NodeAddedResult n0_added = graph.add_node({}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = graph.add_node({v0}, 1_n); + Node n1 = n1_added.node; + OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; + + NodeAddedResult n2_added = graph.add_node({v1}, 1_n); + Node n2 = n2_added.node; + OpenDataflowValue v2 = OpenDataflowValue{get_only(n2_added.outputs)}; + + NodeAddedResult n3_added = graph.add_node({v2}, 1_n); + Node n3 = n3_added.node; + OpenDataflowValue v3 = OpenDataflowValue{get_only(n3_added.outputs)}; + + UnlabelledDataflowGraphPatternMatch match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {p0, n0}, + {p1, n1}, + }, + bidict{}}; + + UnlabelledDataflowGraphPatternMatch invalid_match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {p0, n1}, + {p1, n2}, + }, + bidict{}}; + + std::vector n1_incoming = {OpenDataflowEdge{ + DataflowEdge{ + DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}, + }, + }}; + + SUBCASE("get_incoming_edges") { + SUBCASE("n0") { + std::vector result = get_incoming_edges(graph, n0); + std::vector correct = {}; + CHECK(result == correct); + } + SUBCASE("n1") { + std::vector result = get_incoming_edges(graph, n1); + std::vector correct = n1_incoming; + CHECK(result == correct); + } + SUBCASE("both") { + std::unordered_map> result = + get_incoming_edges(graph, {n0, n1}); + std::unordered_map> correct = { + {n0, {}}, {n1, n1_incoming}}; + CHECK(result == correct); + } + } + + SUBCASE("get_subgraph_inputs") { + std::unordered_set result = + get_subgraph_inputs(graph, {n0, n1}); + std::unordered_set correct = {}; + CHECK(result == correct); + } + + SUBCASE("get_subgraph") { + OpenDataflowGraphView g = get_subgraph(graph, {n0, n1}).graph; + SUBCASE("nodes") { + std::unordered_set result = get_nodes(g); + std::unordered_set correct = {n0, n1}; + CHECK(result == correct); + } + SUBCASE("inputs") { + std::unordered_set result = g.get_inputs(); + std::unordered_set correct = {}; + CHECK(result == correct); + } + SUBCASE("get_open_dataflow_values") { + std::unordered_set values = + get_open_dataflow_values(g); + CHECK(values.size() == 2); + } + } + + SUBCASE("subgraph_matched") { + OpenDataflowGraphView result = subgraph_matched(graph, match).graph; + std::unordered_set result_nodes = get_nodes(result); + std::unordered_set correct_nodes = {n0, n1}; + CHECK(result_nodes == correct_nodes); + } + + SUBCASE("unlabelled_pattern_does_match") { + CHECK(unlabelled_pattern_does_match( + pattern, graph, match, match_additional_crition_always_true())); + CHECK_FALSE(unlabelled_pattern_does_match( + pattern, + graph, + invalid_match, + match_additional_crition_always_true())); + } + + SUBCASE("unlabelled_pattern_does_match") { + OpenDataflowGraph g = + OpenDataflowGraph::create(); + DataflowGraphInput i0 = g.add_input(); + + NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); + Node g_n0 = g_n0_added.node; + OpenDataflowValue g_v0 = OpenDataflowValue{get_only(g_n0_added.outputs)}; + PatternNode g_p0 = PatternNode{g_n0}; + PatternInput g_pi0 = PatternInput{i0}; + + UnlabelledGraphPattern open_pattern = UnlabelledGraphPattern{g}; + + UnlabelledDataflowGraphPatternMatch open_match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {g_p0, n1}, + }, + bidict{ + 
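// Unlike the closed-pattern matches above, an open pattern must also bind its
// PatternInput to a concrete value in the target graph: here g_pi0 is matched
// against v0, the output produced by n0.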
{g_pi0, v0}, + }}; + + CHECK(unlabelled_pattern_does_match( + open_pattern, + graph, + open_match, + match_additional_crition_always_true())); + } + } +} diff --git a/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc b/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc index e4d763d9c3..1bddb9f680 100644 --- a/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc @@ -13,11 +13,11 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph g = OpenDataflowGraph::create(); - NodeAddedResult n0_added = g.add_node({}, 1); + NodeAddedResult n0_added = g.add_node({}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = g.add_node({v0}, 1); + NodeAddedResult n1_added = g.add_node({v0}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; @@ -77,11 +77,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); DataflowGraphInput i1 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = g.add_node({OpenDataflowValue{i1}}, 1); + NodeAddedResult n1_added = g.add_node({OpenDataflowValue{i1}}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; diff --git a/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc b/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc index e0805dbfd4..22d1b8a2a5 100644 --- a/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK_FALSE(is_singleton_pattern(pattern)); } - NodeAddedResult n0_added = g.add_node({}, 1); + NodeAddedResult n0_added = g.add_node({}, 1_n); OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; SUBCASE("1 node") { @@ -26,7 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(is_singleton_pattern(pattern)); } - NodeAddedResult n1_added = g.add_node({v0}, 1); + NodeAddedResult n1_added = g.add_node({v0}, 1_n); OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; SUBCASE("more than 1 node") { diff --git a/lib/substitutions/test/src/test_substitution.cc b/lib/substitutions/test/src/test_substitution.cc deleted file mode 100644 index dcb06a78fa..0000000000 --- a/lib/substitutions/test/src/test_substitution.cc +++ /dev/null @@ -1,148 +0,0 @@ -#include "doctest/doctest.h" -#include "op-attrs/get_op_type.h" -#include "rapidcheck.h" -#include "substitutions/substitution.h" - -using namespace FlexFlow; - -// TEST_SUITE(FF_TEST_SUITE) { -// TEST_CASE("substitution") { -// PCGPattern pattern; -// OutputGraphExpr output_expr; -// bidict{ -// OperatorAttributeConstraint{ConstraintType::EQUAL, -// OperatorAttributeKey::OP_TYPE, -// OperatorType::LINEAR}}}; -// -// ParallelTensorPattern tensor_pattern_e0{ -// std::vector{ -// TensorAttributeConstraint{ConstraintType::EQUAL, -// ListIndexAccess{ -// TensorAttributeKey::DIM_SIZES, 0}, -// 2}}}; -// -// ParallelTensorPattern tensor_pattern_empty{ -// std::vector{}}; -// -// auto ig = -// OutputLabelledOpenMultiDiGraph:: -// create>(); -// Node n0 = 
ig.add_node(operator_pattern_n0); -// NodePort p0 = ig.add_node_port(); -// InputMultiDiEdge e0{n0, p0, std::make_pair(p0.value(), p0.value())}; -// ig.add_edge(e0); -// ig.add_label(e0, tensor_pattern_e0); -// -// RC_ASSERT(get_nodes(ig).size() == 1); -// RC_ASSERT(get_edges(ig).size() == 1); -// -// GraphPattern input_graph{ig}; -// -// OperatorAttrAssignment op_ass_n1{ -// {{OperatorAttributeKey::OP_TYPE, -// AttrConstant{OperatorType::REPARTITION}}, -// {OperatorAttributeKey::PARALLEL_DIM, -// AttrConstant{ff_dim_t{nonnegative_int{0}}}}, -// {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}}; -// -// OperatorAttrAssignment op_ass_n2{ -// {{OperatorAttributeKey::OP_TYPE, AttrConstant{OperatorType::LINEAR}}, -// {OperatorAttributeKey::OUT_CHANNELS, -// OperatorAttrAccess{n0, OperatorAttributeKey::OUT_CHANNELS}}, -// {OperatorAttributeKey::USE_BIAS, -// OperatorAttrAccess{n0, OperatorAttributeKey::USE_BIAS}}, -// {OperatorAttributeKey::DATA_TYPE, -// OperatorAttrAccess{n0, OperatorAttributeKey::DATA_TYPE}}, -// {OperatorAttributeKey::ACTIVATION, -// OperatorAttrAccess{n0, OperatorAttributeKey::ACTIVATION}}, -// {OperatorAttributeKey::REGULARIZER, -// OperatorAttrAccess{n0, OperatorAttributeKey::REGULARIZER}}}}; -// -// OperatorAttrAssignment op_ass_n3{ -// {{OperatorAttributeKey::OP_TYPE, -// AttrConstant{OperatorType::REDUCTION}}, -// {OperatorAttributeKey::PARALLEL_DIM, -// AttrConstant{ff_dim_t{nonnegative_int{0}}}}, -// {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}}; -// -// auto og = NodeLabelledOpenMultiDiGraph::create< -// UnorderedNodeLabelledOpenMultiDiGraph>(); -// Node n1 = og.add_node(op_ass_n1); -// Node n2 = og.add_node(op_ass_n2); -// Node n3 = og.add_node(op_ass_n3); -// NodePort p1 = og.add_node_port(); -// NodePort p2 = og.add_node_port(); -// NodePort p3 = og.add_node_port(); -// InputMultiDiEdge e1{n1, p1, {p1.value(), p1.value()}}; -// MultiDiEdge e2{n2, p2, n1, p1}; -// MultiDiEdge e3{n3, p3, n2, p2}; -// og.add_edge(e1); -// og.add_edge(e2); -// og.add_edge(e3); -// OutputGraphExpr output_graph_expr{og}; -// -// RC_ASSERT(get_nodes(og).size() == 3); -// RC_ASSERT(get_edges(og).size() == 3); -// -// bidict input_mapping; -// input_mapping.equate(e0, e1); -// bidict output_mapping; -// -// Substitution substitution{ -// input_graph, output_graph_expr, input_mapping, output_mapping}; -// -// SubParallelComputationGraph pcg = -// OutputLabelledOpenMultiDiGraph::create< -// UnorderedOutputLabelledOpenMultiDiGraph>(); -// -// Node n4 = pcg.add_node(Operator{InputAttrs{}, "input"}); -// Node n5 = pcg.add_node(Operator{ -// LinearAttrs{1, false, DataType::FLOAT, Activation::RELU, -// std::nullopt}, "linear"}); -// NodePort p4 = pcg.add_node_port(); -// NodePort p5 = pcg.add_node_port(); -// -// MultiDiEdge e4{n5, p5, n4, p4}; -// pcg.add_edge(e4); -// ParallelDim dim = {2, 1, false}; -// ParallelTensorDims dims = {FFOrdered{dim}}; -// pcg.add_label(e4, ParallelTensor(dims, DataType::FLOAT, -// CreateGrad::YES)); -// -// MatchAdditionalCriterion criterion{ -// [&](Node const &pattern_node, Node const &graph_node) { -// return operator_satisfies(pcg.at(graph_node), -// input_graph.value().at(pattern_node)); -// }, -// [&](OpenMultiDiEdge const &pattern_edge, -// OpenMultiDiEdge const &graph_edge) { -// return parallel_tensor_satisfies( -// pcg.at(graph_edge), input_graph.value().at(pattern_edge)); -// }}; -// -// RC_ASSERT(criterion.node_criterion(n0, n5)); -// -// std::vector matches = -// find_pattern_matches(input_graph, pcg, criterion); -// -// 
RC_ASSERT(matches.size() == 1); -// -// SubParallelComputationGraph new_pcg = -// apply_substitution(pcg, substitution, matches[0]); -// -// RC_ASSERT(get_nodes(new_pcg).size() == 4); -// RC_ASSERT(get_edges(new_pcg).size() == 3); -// } -// } diff --git a/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h b/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h index 86ef6c4b4d..83afc32e0c 100644 --- a/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h +++ b/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h @@ -2,14 +2,16 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_BIDICT_ALGORITHMS_BIDICT_FROM_ENUMERATING_H #include "utils/bidict/bidict.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include namespace FlexFlow { template -bidict bidict_from_enumerating(std::unordered_set const &s) { - bidict result; - int idx = 0; +bidict + bidict_from_enumerating(std::unordered_set const &s) { + bidict result; + nonnegative_int idx = 0_n; for (T const &t : s) { result.equate(idx, t); idx++; @@ -19,9 +21,9 @@ bidict bidict_from_enumerating(std::unordered_set const &s) { } template -bidict bidict_from_enumerating(std::set const &s) { - bidict result; - int idx = 0; +bidict bidict_from_enumerating(std::set const &s) { + bidict result; + nonnegative_int idx = 0_n; for (T const &t : s) { result.equate(idx, t); idx++; diff --git a/lib/utils/include/utils/cli/cli_flag_key.struct.toml b/lib/utils/include/utils/cli/cli_flag_key.struct.toml index 790a752911..9c02fddc3e 100644 --- a/lib/utils/include/utils/cli/cli_flag_key.struct.toml +++ b/lib/utils/include/utils/cli/cli_flag_key.struct.toml @@ -6,8 +6,10 @@ features = [ "fmt", ] -includes = [] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] [[fields]] name = "raw_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml b/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml index d571d0deb3..4c50c277c0 100644 --- a/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml +++ b/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml @@ -6,8 +6,10 @@ features = [ "fmt", ] -includes = [] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] [[fields]] name = "raw_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/containers/at_idx.h b/lib/utils/include/utils/containers/at_idx.h index 757da5c548..fdc13a0231 100644 --- a/lib/utils/include/utils/containers/at_idx.h +++ b/lib/utils/include/utils/containers/at_idx.h @@ -1,17 +1,18 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_AT_IDX_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_AT_IDX_H +#include "utils/nonnegative_int/nonnegative_int.h" #include #include namespace FlexFlow { template -std::optional at_idx(std::vector const &v, size_t idx) { +std::optional at_idx(std::vector const &v, nonnegative_int idx) { if (idx >= v.size()) { return std::nullopt; } else { - return v.at(idx); + return v.at(idx.unwrap_nonnegative()); } } diff --git a/lib/utils/include/utils/containers/enumerate.h b/lib/utils/include/utils/containers/enumerate.h index e3722e52c6..1e8bc1f3dc 100644 --- a/lib/utils/include/utils/containers/enumerate.h +++ b/lib/utils/include/utils/containers/enumerate.h @@ -11,14 +11,14 @@ namespace FlexFlow { /** * @brief Generate a map from indices to elements of \p c. 
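 *
 * A usage sketch (hypothetical values; assumes the nonnegative_int literal
 * suffix _n used elsewhere in this patch):
 *
 *     std::vector<std::string> v = {"a", "b", "c"};
 *     std::map<nonnegative_int, std::string> m = enumerate(v);
 *     // m == {{0_n, "a"}, {1_n, "b"}, {2_n, "c"}}, iterated in key order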
* - * @note We return a std::map rather than a - * std::vector> for consistency + * @note We return a std::map rather than a + * std::vector> for consistency * with enumerate(FFOrdered const &). Note that std::map * provides ordered iteration in increasing order, so iterating through * the result of this function should still function as expected. */ template -std::map enumerate(std::vector const &c) { +std::map enumerate(std::vector const &c) { return enumerate_vector(c); } @@ -27,16 +27,16 @@ std::map enumerate(std::vector const &c) { * return a map from indices of this ordering to elements of \p c. * - * @note We return a std::map rather than a - * std::vector> for consistency + * @note We return a std::map rather than a + * std::vector> for consistency * with enumerate(FFOrdered const &). Note that std::map * provides ordered iteration in increasing order, so iterating through * the result of this function should still function as expected. */ template -std::map enumerate(std::unordered_set const &c) { - std::map result; - int idx = 0; +std::map enumerate(std::unordered_set const &c) { + std::map result; + nonnegative_int idx = 0_n; for (auto const &v : c) { result.insert({idx++, v}); } diff --git a/lib/utils/include/utils/containers/enumerate_vector.h b/lib/utils/include/utils/containers/enumerate_vector.h index 700106ea3f..1e66279306 100644 --- a/lib/utils/include/utils/containers/enumerate_vector.h +++ b/lib/utils/include/utils/containers/enumerate_vector.h @@ -1,16 +1,19 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ENUMERATE_VECTOR_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ENUMERATE_VECTOR_H +#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" #include #include namespace FlexFlow { template -std::map enumerate_vector(std::vector const &v) { - std::map result; - for (int i = 0; i < v.size(); i++) { - result.insert({i, v.at(i)}); +std::map enumerate_vector(std::vector const &v) { + std::map result; + for (nonnegative_int i : nonnegative_range(num_elements(v))) { + result.insert({i, v.at(i.unwrap_nonnegative())}); } return result; } diff --git a/lib/utils/include/utils/containers/flatmap.h b/lib/utils/include/utils/containers/flatmap.h index b016a1e03d..a7848b88aa 100644 --- a/lib/utils/include/utils/containers/flatmap.h +++ b/lib/utils/include/utils/containers/flatmap.h @@ -4,6 +4,7 @@ #include "utils/containers/extend.h" #include "utils/containers/get_element_type.h" #include "utils/containers/merge_maps.h" +#include #include #include @@ -52,7 +53,19 @@ std::unordered_map flatmap(std::unordered_map const &m, std::unordered_map result; for (auto const &[k, v] : m) { - result = merge_maps(result, f(k, v)); + result = merge_disjoint_maps(result, f(k, v)); + } + + return result; +} + +template +std::string flatmap(std::string const &input, F const &f) { + std::string result = ""; + + for (char c : input) { + std::string for_c = f(c); + result += for_c; } return result; diff --git a/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h b/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h index ccdde0131a..0a7e9d16c2 100644 --- a/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h +++ b/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_GET_ALL_PERMUTATIONS_WITH_REPETITION_H #define 
_FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_GET_ALL_PERMUTATIONS_WITH_REPETITION_H +#include "utils/nonnegative_int/nonnegative_int.h" #include #include @@ -14,7 +15,8 @@ namespace FlexFlow { **/ template std::unordered_multiset> - get_all_permutations_with_repetition(C const &container, int n) { + get_all_permutations_with_repetition(C const &container, + nonnegative_int n) { std::unordered_multiset> result; if (container.empty() || n == 0) { @@ -22,16 +24,16 @@ std::unordered_multiset> } std::vector elements(std::begin(container), std::end(container)); - std::vector indices(n, 0); + std::vector indices(n.unwrap_nonnegative(), 0); while (true) { - std::vector perm(n); + std::vector perm(n.unwrap_nonnegative()); for (int i = 0; i < n; ++i) { perm[i] = elements[indices[i]]; } result.insert(perm); - int i = n - 1; + int i = n.unwrap_nonnegative() - 1; while (i != -1 && ++indices[i] == elements.size()) { indices[i] = 0; --i; diff --git a/lib/utils/include/utils/containers/make.h b/lib/utils/include/utils/containers/make.h new file mode 100644 index 0000000000..f7b15dfa02 --- /dev/null +++ b/lib/utils/include/utils/containers/make.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MAKE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MAKE_H + +namespace FlexFlow { + +template +decltype(auto) make() { + return [](auto const &x) { return T{x}; }; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/merge_maps.h b/lib/utils/include/utils/containers/merge_maps.h index dd886ab8aa..bfc2446d99 100644 --- a/lib/utils/include/utils/containers/merge_maps.h +++ b/lib/utils/include/utils/containers/merge_maps.h @@ -3,30 +3,64 @@ #include "utils/containers/are_disjoint.h" #include "utils/containers/keys.h" +#include "utils/containers/merge_method.dtg.h" #include "utils/exception.h" #include "utils/fmt/unordered_map.h" +#include "utils/fmt/unordered_set.h" #include namespace FlexFlow { template -std::unordered_map merge_maps(std::unordered_map const &lhs, - std::unordered_map const &rhs) { - if (!are_disjoint(keys(lhs), keys(rhs))) { - throw mk_runtime_error(fmt::format("Key sets of merge_maps parameters are " - "non-disjoint: lhs = {}, rhs = {}", - lhs, - rhs)); +void merge_in_map(std::unordered_map const &m, + std::unordered_map &result) { + for (auto const &[k, v] : m) { + auto it = result.find(k); + if (it != result.end()) { + it->second = v; + } else { + result.insert({k, v}); + } } +} - std::unordered_map result; - for (auto const &kv : lhs) { - result.insert(kv); - } - for (auto const &kv : rhs) { - result.insert(kv); +template +std::unordered_map + merge_disjoint_maps(std::unordered_map const &lhs, + std::unordered_map const &rhs) { + + std::unordered_set lhs_keys = keys(lhs); + std::unordered_set rhs_keys = keys(rhs); + std::unordered_set shared_keys = intersection(lhs_keys, rhs_keys); + if (!shared_keys.empty()) { + throw mk_runtime_error( + fmt::format("merge_maps expected disjoint maps, but maps share keys {}", + shared_keys)); } + std::unordered_map result; + merge_in_map(lhs, result); + merge_in_map(rhs, result); + return result; +} + +template +std::unordered_map + merge_map_left_dominates(std::unordered_map const &lhs, + std::unordered_map const &rhs) { + std::unordered_map result; + merge_in_map(rhs, result); + merge_in_map(lhs, result); + return result; +} + +template +std::unordered_map + merge_map_right_dominates(std::unordered_map const &lhs, + std::unordered_map const &rhs) { + std::unordered_map result; + 
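// merge_in_map overwrites keys that are already present, so call order decides
// the winner: inserting lhs first and rhs second lets rhs dominate on shared
// keys (the mirror image of merge_map_left_dominates above).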
merge_in_map(lhs, result); + merge_in_map(rhs, result); return result; } diff --git a/lib/utils/include/utils/containers/merge_method.enum.toml b/lib/utils/include/utils/containers/merge_method.enum.toml new file mode 100644 index 0000000000..ec0ed067dd --- /dev/null +++ b/lib/utils/include/utils/containers/merge_method.enum.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "MergeMethod" +features = [ + "json", + "hash", + "fmt", + "rapidcheck", +] + +[[values]] +name = "REQUIRE_DISJOINT" + +[[values]] +name = "LEFT_DOMINATES" + +[[values]] +name = "RIGHT_DOMINATES" diff --git a/lib/utils/include/utils/containers/product.h b/lib/utils/include/utils/containers/product.h index af04edcb81..30aac2681a 100644 --- a/lib/utils/include/utils/containers/product.h +++ b/lib/utils/include/utils/containers/product.h @@ -10,7 +10,7 @@ namespace FlexFlow { **/ template Element product(Container const &container) { - Element result = 1; + Element result = Element{1}; for (Element const &element : container) { result *= element; } diff --git a/lib/utils/include/utils/containers/repeat.h b/lib/utils/include/utils/containers/repeat.h index 18de92cf4a..9782d6265a 100644 --- a/lib/utils/include/utils/containers/repeat.h +++ b/lib/utils/include/utils/containers/repeat.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPEAT_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPEAT_H +#include "utils/nonnegative_int/nonnegative_int.h" #include #include #include @@ -8,9 +9,7 @@ namespace FlexFlow { template > -std::vector repeat(int n, F const &f) { - assert(n >= 0); - +std::vector repeat(nonnegative_int n, F const &f) { std::vector result; for (int i = 0; i < n; i++) { result.push_back(f()); diff --git a/lib/utils/include/utils/containers/repeat_element.h b/lib/utils/include/utils/containers/repeat_element.h new file mode 100644 index 0000000000..e1ac508116 --- /dev/null +++ b/lib/utils/include/utils/containers/repeat_element.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H + +#include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include +#include + +namespace FlexFlow { + +template +std::vector repeat_element(nonnegative_int num_times, T const &element) { + std::vector result; + for (int i = 0; i < num_times; ++i) { + result.push_back(element); + } + return result; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/replicate.h b/lib/utils/include/utils/containers/replicate.h deleted file mode 100644 index aa3d0a7e35..0000000000 --- a/lib/utils/include/utils/containers/replicate.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H -#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H - -#include - -namespace FlexFlow { - -template -std::vector replicate(int n, T const &element) { - return std::vector(n, element); -} - -} // namespace FlexFlow - -#endif diff --git a/lib/utils/include/utils/containers/sum.h b/lib/utils/include/utils/containers/sum.h index 135e704045..d6061e396e 100644 --- a/lib/utils/include/utils/containers/sum.h +++ b/lib/utils/include/utils/containers/sum.h @@ -8,7 +8,7 @@ namespace FlexFlow { **/ template Element sum(Container const &container) { - Element result = 0; + Element result = Element{0}; for (Element const &element : container) { result += element; } diff --git 
a/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h new file mode 100644 index 0000000000..b12e20124f --- /dev/null +++ b/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h @@ -0,0 +1,34 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H + +#include "utils/graph/dataflow_graph/dataflow_graph_view.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" + +namespace FlexFlow { + +struct ViewDataflowGraphAsOpenDataflowGraph final + : public IOpenDataflowGraphView { + + ViewDataflowGraphAsOpenDataflowGraph() = delete; + ViewDataflowGraphAsOpenDataflowGraph(DataflowGraphView const &); + + std::unordered_set query_nodes(NodeQuery const &) const override; + std::unordered_set + query_outputs(DataflowOutputQuery const &) const override; + std::unordered_set get_inputs() const override; + std::unordered_set + query_edges(OpenDataflowEdgeQuery const &) const override; + + ViewDataflowGraphAsOpenDataflowGraph *clone() const override; + + virtual ~ViewDataflowGraphAsOpenDataflowGraph() = default; + +private: + DataflowGraphView g; +}; + +OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml index 0b0c5a41d8..aed0c28aeb 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml @@ -10,6 +10,7 @@ features = [ includes = [ "utils/graph/query_set.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -18,7 +19,7 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "src_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" [[fields]] name = "dst_nodes" @@ -26,4 +27,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "dst_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h index 6a1898dd13..58c28aaff6 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h @@ -4,13 +4,14 @@ #include "utils/graph/dataflow_graph/dataflow_graph_view.h" #include "utils/graph/dataflow_graph/i_dataflow_graph.h" #include "utils/graph/dataflow_graph/node_added_result.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { struct DataflowGraph : virtual public DataflowGraphView { public: NodeAddedResult add_node(std::vector const &inputs, - int num_outputs); + nonnegative_int num_outputs); void add_node_unsafe(Node const &node, std::vector const &inputs, diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml index f322fa63fe..eb9c30d558 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml +++ 
b/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml @@ -9,6 +9,7 @@ features = [ includes = [ "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -17,4 +18,4 @@ type = "::FlexFlow::Node" [[fields]] name = "idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml index f3ccebe046..19d92a3d4c 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml @@ -9,6 +9,7 @@ features = [ includes = [ "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -17,4 +18,4 @@ type = "::FlexFlow::Node" [[fields]] name = "idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml index 0701855ba6..d1af6d5c0d 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml @@ -10,6 +10,10 @@ features = [ includes = [ "utils/graph/query_set.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", +] + +src_includes = [ "utils/fmt/unordered_set.h", ] @@ -19,4 +23,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "output_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h index 87882a6242..2572fe5c68 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h +++ b/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { struct IDataflowGraph : virtual public IDataflowGraphView { virtual NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) = 0; + nonnegative_int num_outputs) = 0; virtual void add_node_unsafe(Node const &node, std::vector const &inputs, diff --git a/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h b/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h index 4ed83834a2..ecba7921af 100644 --- a/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h +++ b/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h @@ -14,9 +14,9 @@ struct UnorderedSetDataflowGraph final : virtual public IDataflowGraph, UnorderedSetDataflowGraph(); NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; DataflowGraphInput add_input() override; std::unordered_set query_nodes(NodeQuery const &) const override; diff --git a/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h b/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h index f1063c1f21..159778bb6d 100644 --- a/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h @@ -57,9 +57,10 @@ 
struct UnorderedSetLabelledOpenDataflowGraph final } std::vector new_outputs = - transform(count(output_labels.size()), [&](int output_idx) { - return DataflowOutput{new_node, output_idx}; - }); + transform(nonnegative_range(num_elements(output_labels)), + [&](nonnegative_int output_idx) { + return DataflowOutput{new_node, output_idx}; + }); for (auto const &[output, output_label] : zip(new_outputs, output_labels)) { this->values.insert({OpenDataflowValue{output}, output_label}); diff --git a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h index ec8f025ac3..2115a03cda 100644 --- a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h +++ b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h @@ -4,6 +4,7 @@ #include "utils/graph/labelled_open_dataflow_graph/algorithms/labelled_open_dataflow_graph_data.dtg.h" #include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_edges.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" namespace FlexFlow { diff --git a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h index 2d1dd03755..88950635d2 100644 --- a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h +++ b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_LABELLED_OPEN_DATAFLOW_GRAPH_ALGORITHMS_PERMUTE_NODE_IDS_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_LABELLED_OPEN_DATAFLOW_GRAPH_ALGORITHMS_PERMUTE_NODE_IDS_H +#include "utils/containers/generate_map.h" #include "utils/graph/labelled_open_dataflow_graph/algorithms/with_labelling.h" #include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" #include "utils/graph/node/algorithms.h" diff --git a/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h b/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h index 737f2d0d23..80d0ca3eaf 100644 --- a/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h +++ b/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h @@ -2,10 +2,11 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_MULTIDIGRAPH_ALGORITHMS_ADD_NODES_H #include "utils/graph/multidigraph/multidigraph.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { -std::vector add_nodes(MultiDiGraph &, int num_nodes); +std::vector add_nodes(MultiDiGraph &, nonnegative_int num_nodes); } // namespace FlexFlow diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h new file mode 100644 index 0000000000..ae99e2850f --- /dev/null +++ b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_ARE_ISOMORPHIC_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_ARE_ISOMORPHIC_H + +#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" + +namespace FlexFlow { + +bool 
are_isomorphic(OpenDataflowGraphView const &, + OpenDataflowGraphView const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h new file mode 100644 index 0000000000..fe282a8c2e --- /dev/null +++ b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_OPEN_DATAFLOW_GRAPH_ISOMORPHISM_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_OPEN_DATAFLOW_GRAPH_ISOMORPHISM_H + +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.dtg.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" + +namespace FlexFlow { + +OpenDataflowValue isomorphism_map_r_open_dataflow_value_from_l( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &l_value); +OpenDataflowValue isomorphism_map_l_open_dataflow_value_from_r( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &r_value); + +DataflowOutput isomorphism_map_r_dataflow_output_from_l( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &l_output); +DataflowOutput isomorphism_map_l_dataflow_output_from_r( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &r_output); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml b/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml index 544a05af85..f67e8b88e0 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml +++ b/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml @@ -11,6 +11,7 @@ includes = [ "utils/graph/query_set.h", "utils/graph/open_dataflow_graph/dataflow_graph_input.dtg.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -23,4 +24,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "dst_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h index 6edfa408d4..9b71b06e62 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { struct IOpenDataflowGraph : virtual public IOpenDataflowGraphView { virtual NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) = 0; + nonnegative_int num_outputs) = 0; virtual DataflowGraphInput add_input() = 0; virtual IOpenDataflowGraph *clone() const = 0; diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h index 09499f8e5f..1102bf0586 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h @@ -7,7 +7,7 @@ namespace FlexFlow { Node get_open_dataflow_edge_dst_node(OpenDataflowEdge const &); -int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &); +nonnegative_int 
get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &); DataflowInput get_open_dataflow_edge_dst(OpenDataflowEdge const &); OpenDataflowValue get_open_dataflow_edge_src(OpenDataflowEdge const &); OpenDataflowEdge diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h index e8ecce76e8..9d48020d5f 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h @@ -11,7 +11,7 @@ namespace FlexFlow { struct OpenDataflowGraph : virtual public OpenDataflowGraphView { public: NodeAddedResult add_node(std::vector const &inputs, - int num_outputs); + nonnegative_int num_outputs); DataflowGraphInput add_input(); template diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h index 7b921772d6..f3d54e4329 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h @@ -12,7 +12,7 @@ struct UnorderedSetOpenDataflowGraph : public IOpenDataflowGraph { UnorderedSetOpenDataflowGraph(); NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; std::unordered_set query_nodes(NodeQuery const &) const override; std::unordered_set diff --git a/lib/utils/include/utils/graph/render_dot.h b/lib/utils/include/utils/graph/render_dot.h new file mode 100644 index 0000000000..632ba736ea --- /dev/null +++ b/lib/utils/include/utils/graph/render_dot.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RENDER_DOT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RENDER_DOT_H + +#include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" +#include +#include + +namespace FlexFlow { + +std::string escape_dot_string(std::string const &); +std::string render_dot_node_attrs( + std::unordered_map const &attrs); +std::string render_dot( + LabelledDataflowGraphView, + std::string> const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/ceildiv.h b/lib/utils/include/utils/nonnegative_int/ceildiv.h new file mode 100644 index 0000000000..939ea3de51 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/ceildiv.h @@ -0,0 +1,11 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_CEILDIV_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_CEILDIV_H + +#include "utils/nonnegative_int/nonnegative_int.h" +namespace FlexFlow { + +nonnegative_int ceildiv(nonnegative_int numerator, nonnegative_int denominator); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h index 0749497c56..0bcc8cfd6f 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h @@ -1,12 +1,11 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_INT_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_INT_H -#include "rapidcheck.h" - #include #include #include #include +#include #include namespace FlexFlow { @@ -14,6 +13,7 @@ class nonnegative_int { public: nonnegative_int() = delete; explicit nonnegative_int(int value); + 
explicit nonnegative_int(size_t value); explicit operator int() const noexcept; @@ -39,16 +39,31 @@ class nonnegative_int { friend bool operator>=(int const &lhs, nonnegative_int const &rhs); nonnegative_int operator+(nonnegative_int const &other) const; + nonnegative_int &operator++(); + nonnegative_int operator++(int); + nonnegative_int &operator+=(nonnegative_int const &other); + + nonnegative_int operator*(nonnegative_int const &other) const; + nonnegative_int &operator*=(nonnegative_int const &other); + + nonnegative_int operator/(nonnegative_int const &other) const; + nonnegative_int &operator/=(nonnegative_int const &other); + + nonnegative_int operator%(nonnegative_int const &other) const; + nonnegative_int &operator%=(nonnegative_int const &other); friend std::ostream &operator<<(std::ostream &os, nonnegative_int const &n); friend int format_as(nonnegative_int const &); - int get_value() const; + int unwrap_nonnegative() const; private: int value_; }; + +nonnegative_int operator""_n(unsigned long long int); + } // namespace FlexFlow namespace nlohmann { @@ -59,6 +74,13 @@ struct adl_serializer<::FlexFlow::nonnegative_int> { }; } // namespace nlohmann +namespace rc { +template <> +struct Arbitrary<::FlexFlow::nonnegative_int> { + static Gen<::FlexFlow::nonnegative_int> arbitrary(); +}; +} // namespace rc + namespace std { template <> struct hash<::FlexFlow::nonnegative_int> {
diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_range.h b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h new file mode 100644 index 0000000000..af323aef42 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h @@ -0,0 +1,14 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_RANGE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_RANGE_H + +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +std::vector<nonnegative_int> nonnegative_range(nonnegative_int end); +std::vector<nonnegative_int> + nonnegative_range(nonnegative_int start, nonnegative_int end, int step = 1); + +} // namespace FlexFlow + +#endif
diff --git a/lib/utils/include/utils/nonnegative_int/num_elements.h b/lib/utils/include/utils/nonnegative_int/num_elements.h new file mode 100644 index 0000000000..57bc98ee50 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/num_elements.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NUM_ELEMENTS_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NUM_ELEMENTS_H + +#include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +template <typename T> +nonnegative_int num_elements(T const &t) { + size_t t_size = t.size(); + return nonnegative_int{t_size}; +} + +} // namespace FlexFlow + +#endif
diff --git a/lib/utils/include/utils/variant.h b/lib/utils/include/utils/variant.h index 241d631200..75a8851362 100644 --- a/lib/utils/include/utils/variant.h +++ b/lib/utils/include/utils/variant.h @@ -4,6 +4,7 @@ #include "rapidcheck.h" #include "utils/type_traits.h" #include +#include #include namespace FlexFlow {
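// Editor's note: a minimal sketch (not part of the patch) of the
// nonnegative_int surface declared above: the ""_n literal, the new
// arithmetic operators, nonnegative_range, and num_elements. The function
// name nonnegative_int_usage_sketch is illustrative only.
#include "utils/nonnegative_int/nonnegative_int.h"
#include "utils/nonnegative_int/nonnegative_range.h"
#include "utils/nonnegative_int/num_elements.h"
#include <vector>

using namespace FlexFlow;

void nonnegative_int_usage_sketch() {
  nonnegative_int n = 6_n;
  nonnegative_int half = n / 2_n;   // 3
  nonnegative_int sq = half * half; // 9

  // nonnegative_range(end) enumerates {0_n, 1_n, ..., end - 1}
  std::vector<nonnegative_int> idxs = nonnegative_range(3_n);

  // num_elements wraps container.size() through the checked size_t
  // constructor added above
  nonnegative_int count = num_elements(idxs); // == 3_n
}
diff --git a/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc b/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc index 350f08600c..67e0b32d6e 100644 --- a/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc +++ b/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc @@ -1 +1,14 @@ #include "utils/bidict/algorithms/bidict_from_enumerating.h" +#include "utils/archetypes/value_type.h" +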
+namespace FlexFlow { + +using T = value_type<0>; + +template bidict + bidict_from_enumerating(std::unordered_set const &); + +template bidict + bidict_from_enumerating(std::set const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/cli/cli_parse.cc b/lib/utils/src/utils/cli/cli_parse.cc index 07982c0c2d..36d5837f9c 100644 --- a/lib/utils/src/utils/cli/cli_parse.cc +++ b/lib/utils/src/utils/cli/cli_parse.cc @@ -32,7 +32,7 @@ tl::expected {}, }; - int consumed_positional_args = 0; + nonnegative_int consumed_positional_args = 0_n; auto parse_positional_arg = [&](std::string const &arg) -> std::optional { if (consumed_positional_args >= cli.positional_arguments.size()) { @@ -40,8 +40,8 @@ tl::expected cli.positional_arguments.size()); } - CLIPositionalArgumentSpec arg_spec = - cli.positional_arguments.at(consumed_positional_args); + CLIPositionalArgumentSpec arg_spec = cli.positional_arguments.at( + consumed_positional_args.unwrap_nonnegative()); if (arg_spec.choices.has_value() && !contains(arg_spec.choices.value(), arg)) { diff --git a/lib/utils/src/utils/cli/cli_spec.cc b/lib/utils/src/utils/cli/cli_spec.cc index ca51cfe57f..e314f6fd55 100644 --- a/lib/utils/src/utils/cli/cli_spec.cc +++ b/lib/utils/src/utils/cli/cli_spec.cc @@ -2,6 +2,8 @@ #include "utils/containers/count.h" #include "utils/containers/transform.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -10,8 +12,8 @@ CLISpec empty_cli_spec() { } std::vector cli_get_flag_keys(CLISpec const &cli) { - return transform(count(cli.flags.size()), - [](int idx) { return CLIFlagKey{idx}; }); + return transform(nonnegative_range(num_elements(cli.flags)), + [](nonnegative_int idx) { return CLIFlagKey{idx}; }); } CLIArgumentKey cli_add_help_flag(CLISpec &cli) { @@ -21,17 +23,18 @@ CLIArgumentKey cli_add_help_flag(CLISpec &cli) { } CLIArgumentKey cli_add_flag(CLISpec &cli, CLIFlagSpec const &flag_spec) { + CLIArgumentKey key = CLIArgumentKey{CLIFlagKey{num_elements(cli.flags)}}; cli.flags.push_back(flag_spec); - - return CLIArgumentKey{CLIFlagKey{int_from_size_t(cli.flags.size()) - 1}}; + return key; } CLIArgumentKey cli_add_positional_argument(CLISpec &cli, CLIPositionalArgumentSpec const &arg) { + CLIArgumentKey key = CLIArgumentKey{ + CLIPositionalArgumentKey{num_elements(cli.positional_arguments)}}; cli.positional_arguments.push_back(arg); - return CLIArgumentKey{CLIPositionalArgumentKey{ - int_from_size_t(cli.positional_arguments.size()) - 1}}; + return key; } } // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/at_idx.cc b/lib/utils/src/utils/containers/at_idx.cc index 45b1a31fce..14a0695c6d 100644 --- a/lib/utils/src/utils/containers/at_idx.cc +++ b/lib/utils/src/utils/containers/at_idx.cc @@ -1 +1,10 @@ #include "utils/containers/at_idx.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using E = value_type<0>; + +template std::optional at_idx(std::vector const &, nonnegative_int); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/enumerate.cc b/lib/utils/src/utils/containers/enumerate.cc index 0984b6dc63..ca5ad6ddc1 100644 --- a/lib/utils/src/utils/containers/enumerate.cc +++ b/lib/utils/src/utils/containers/enumerate.cc @@ -1 +1,12 @@ #include "utils/containers/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(std::vector const &); + +template std::map 
enumerate(std::unordered_set const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/enumerate_vector.cc b/lib/utils/src/utils/containers/enumerate_vector.cc index d4fd131af2..0d0bd1c277 100644 --- a/lib/utils/src/utils/containers/enumerate_vector.cc +++ b/lib/utils/src/utils/containers/enumerate_vector.cc @@ -1 +1,10 @@ #include "utils/containers/enumerate_vector.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate_vector(std::vector const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/make.cc b/lib/utils/src/utils/containers/make.cc new file mode 100644 index 0000000000..29b5bc5184 --- /dev/null +++ b/lib/utils/src/utils/containers/make.cc @@ -0,0 +1,8 @@ +#include "utils/containers/make.h" +#include + +namespace FlexFlow { + +template decltype(auto) make>(); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/range.cc b/lib/utils/src/utils/containers/range.cc index d3ebd1063b..f3baab3db1 100644 --- a/lib/utils/src/utils/containers/range.cc +++ b/lib/utils/src/utils/containers/range.cc @@ -1,5 +1,6 @@ #include "utils/containers/range.h" #include +#include namespace FlexFlow { diff --git a/lib/utils/src/utils/containers/repeat.cc b/lib/utils/src/utils/containers/repeat.cc index 76e46f0fdc..777996d995 100644 --- a/lib/utils/src/utils/containers/repeat.cc +++ b/lib/utils/src/utils/containers/repeat.cc @@ -1 +1,11 @@ #include "utils/containers/repeat.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using Out = value_type<0>; +using F = std::function; + +template std::vector repeat(nonnegative_int, F const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/repeat_element.cc b/lib/utils/src/utils/containers/repeat_element.cc new file mode 100644 index 0000000000..70889eb971 --- /dev/null +++ b/lib/utils/src/utils/containers/repeat_element.cc @@ -0,0 +1,10 @@ +#include "utils/containers/repeat_element.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector repeat_element(nonnegative_int, T const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/replicate.cc b/lib/utils/src/utils/containers/replicate.cc deleted file mode 100644 index 2fb2f079f6..0000000000 --- a/lib/utils/src/utils/containers/replicate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/replicate.h" diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc index f0e52d6fc2..7069146057 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc @@ -27,7 +27,7 @@ std::vector get_outputs(DataflowGraphView const &g, Node const &n) { return sorted_by(g.query_outputs(DataflowOutputQuery{ query_set{n}, - query_set::matchall(), + query_set::matchall(), }), [](DataflowOutput const &l, DataflowOutput const &r) { return l.idx < r.idx; diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc index 47c30ce998..2ae903fa0b 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc @@ -1,27 +1,36 @@ #include "utils/graph/dataflow_graph/algorithms/as_dot.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/map_keys.h" #include 
"utils/dot_file.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/with_labelling.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/render_dot.h" #include "utils/record_formatter.h" namespace FlexFlow { -// WARN(@lockshaw): doing this all with string ids is ugly and error prone, -// as it requires duplicating the stringification logic across functions. -// -// Fixing this is tracked in issue std::string as_dot(DataflowGraphView const &g) { - std::ostringstream oss; - DotFile dot = DotFile{oss}; + auto get_node_attrs = [](Node const &) { + return std::unordered_map{}; + }; + + std::unordered_map> + node_labels = generate_map(get_nodes(g), get_node_attrs); - std::function get_node_label = - [](Node const &n) -> std::string { - return fmt::format("n{}", n.raw_uid); + auto get_output_label = [](DataflowOutput const &o) { + return fmt::to_string(o.idx); }; - as_dot(dot, g, get_node_label); - dot.close(); - return oss.str(); + std::unordered_map output_labels = + generate_map(get_all_dataflow_outputs(g), get_output_label); + std::unordered_map value_labels = + map_keys(output_labels, + [](DataflowOutput const &o) { return OpenDataflowValue{o}; }); + + return render_dot(with_labelling( + view_as_open_dataflow_graph(g), node_labels, value_labels)); } void as_dot(DotFile &dot, @@ -29,9 +38,13 @@ void as_dot(DotFile &dot, std::function const &get_node_label) { auto get_node_name = [](Node n) { return fmt::format("n{}", n.raw_uid); }; - auto get_input_field = [](int idx) { return fmt::format("i{}", idx); }; + auto get_input_field = [](nonnegative_int idx) { + return fmt::format("i{}", idx); + }; - auto get_output_field = [](int idx) { return fmt::format("o{}", idx); }; + auto get_output_field = [](nonnegative_int idx) { + return fmt::format("o{}", idx); + }; for (Node const &n : get_nodes(g)) { std::vector n_inputs = get_dataflow_inputs(g, n); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc index c07d344d05..73afc11acc 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc @@ -6,9 +6,9 @@ std::unordered_set get_dataflow_edges_from_node_to_node( DataflowGraphView const &g, Node const &src, Node const &dst) { return g.query_edges(DataflowEdgeQuery{ /*src_nodes=*/query_set{src}, - /*src_idxs=*/query_set::matchall(), + /*src_idxs=*/query_set::matchall(), /*dst_nodes=*/query_set{dst}, - /*dst_idxs=*/query_set::matchall(), + /*dst_idxs=*/query_set::matchall(), }); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc index 9500836db1..c4947f967a 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc @@ -7,9 +7,9 @@ std::vector get_incoming_edges(DataflowGraphView const &g, Node const &n) { return sorted_by(g.query_edges(DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), {n}, - query_set::matchall(), + query_set::matchall(), }), [](DataflowEdge const &l, DataflowEdge const &r) { return l.dst.idx 
< r.dst.idx; @@ -21,9 +21,9 @@ std::unordered_set std::unordered_set const &ns) { DataflowEdgeQuery query = DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set{ns}, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc index 2376e4897f..16b2b82b2d 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -7,9 +7,9 @@ std::unordered_set get_outgoing_edges(DataflowGraphView const &g, Node const &n) { return g.query_edges(DataflowEdgeQuery{ {n}, - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }); } @@ -18,9 +18,9 @@ std::unordered_set std::unordered_set const &ns) { DataflowEdgeQuery query = DataflowEdgeQuery{ query_set{ns}, - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index d17a84dd12..a06ec1ab31 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -13,9 +13,9 @@ std::unordered_set DataflowEdgeQuery query = DataflowEdgeQuery{ src_query, - query_set::matchall(), + query_set::matchall(), query_set{ns}, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc index c442a26dab..f94dd94e11 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc @@ -13,9 +13,9 @@ std::unordered_set DataflowEdgeQuery query = DataflowEdgeQuery{ query_set{ns}, - query_set::matchall(), + query_set::matchall(), dst_query, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc index 0fd0b85b71..703db4bf91 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc @@ -3,16 +3,18 @@ namespace FlexFlow { -ViewDataflowGraphAsOpen::ViewDataflowGraphAsOpen(DataflowGraphView const &g) +ViewDataflowGraphAsOpenDataflowGraph::ViewDataflowGraphAsOpenDataflowGraph( + DataflowGraphView const &g) : g(g) {} -std::unordered_set - ViewDataflowGraphAsOpen::query_nodes(NodeQuery const &q) const { +std::unordered_set ViewDataflowGraphAsOpenDataflowGraph::query_nodes( + NodeQuery const &q) const { return this->g.query_nodes(q); } std::unordered_set - ViewDataflowGraphAsOpen::query_edges(OpenDataflowEdgeQuery const &q) const { + ViewDataflowGraphAsOpenDataflowGraph::query_edges( + OpenDataflowEdgeQuery const &q) const { std::unordered_set 
closed_edges = this->g.query_edges(q.standard_edge_query); @@ -21,21 +23,23 @@ std::unordered_set } std::unordered_set - ViewDataflowGraphAsOpen::query_outputs(DataflowOutputQuery const &q) const { + ViewDataflowGraphAsOpenDataflowGraph::query_outputs( + DataflowOutputQuery const &q) const { return this->g.query_outputs(q); } std::unordered_set - ViewDataflowGraphAsOpen::get_inputs() const { + ViewDataflowGraphAsOpenDataflowGraph::get_inputs() const { return {}; } -ViewDataflowGraphAsOpen *ViewDataflowGraphAsOpen::clone() const { - return new ViewDataflowGraphAsOpen{this->g}; +ViewDataflowGraphAsOpenDataflowGraph * + ViewDataflowGraphAsOpenDataflowGraph::clone() const { + return new ViewDataflowGraphAsOpenDataflowGraph{this->g}; } OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &g) { - return OpenDataflowGraphView::create(g); + return OpenDataflowGraphView::create(g); } } // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h deleted file mode 100644 index bec9d0e019..0000000000 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LIB_UTILS_SRC_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H -#define _FLEXFLOW_LIB_UTILS_SRC_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H - -#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" - -namespace FlexFlow { - -struct ViewDataflowGraphAsOpen final : public IOpenDataflowGraphView { -public: - ViewDataflowGraphAsOpen() = delete; - ViewDataflowGraphAsOpen(DataflowGraphView const &); - - std::unordered_set query_nodes(NodeQuery const &) const override; - std::unordered_set - query_edges(OpenDataflowEdgeQuery const &) const override; - std::unordered_set - query_outputs(DataflowOutputQuery const &) const override; - std::unordered_set get_inputs() const override; - - ViewDataflowGraphAsOpen *clone() const override; - - ~ViewDataflowGraphAsOpen() = default; - -private: - DataflowGraphView g; -}; - -OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc index 2196f7a028..982969f3a5 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc @@ -5,18 +5,18 @@ namespace FlexFlow { DataflowEdgeQuery dataflow_edge_query_all() { return DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowEdgeQuery dataflow_edge_query_none() { return DataflowEdgeQuery{ query_set::match_none(), - query_set::match_none(), + query_set::match_none(), query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -30,9 +30,9 @@ bool dataflow_edge_query_includes_dataflow_edge(DataflowEdgeQuery const &q, DataflowEdgeQuery dataflow_edge_query_for_edge(DataflowEdge const &e) { return DataflowEdgeQuery{ query_set{e.src.node}, - query_set{e.src.idx}, + query_set{e.src.idx}, query_set{e.dst.node}, - query_set{e.dst.idx}, + query_set{e.dst.idx}, }; } @@ -40,9 +40,9 @@ DataflowEdgeQuery dataflow_edge_query_all_outgoing_from(DataflowOutput const &src) { 
return DataflowEdgeQuery{ query_set{src.node}, - query_set{src.idx}, + query_set{src.idx}, query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } @@ -50,9 +50,9 @@ DataflowEdgeQuery dataflow_edge_query_all_incoming_to(DataflowInput const &dst) { return DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set{dst.node}, - query_set{dst.idx}, + query_set{dst.idx}, }; } diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc index 868dd61c6d..8ed36135e1 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc @@ -4,7 +4,7 @@ namespace FlexFlow { NodeAddedResult DataflowGraph::add_node(std::vector const &inputs, - int num_outputs) { + nonnegative_int num_outputs) { return this->get_interface().add_node(inputs, num_outputs); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc index 64df4c77f2..ceaad2bfdf 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc @@ -5,14 +5,14 @@ namespace FlexFlow { DataflowOutputQuery dataflow_output_query_all() { return DataflowOutputQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowOutputQuery dataflow_output_query_none() { return DataflowOutputQuery{ query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -24,7 +24,7 @@ bool dataflow_output_query_includes_dataflow_output( DataflowOutputQuery dataflow_output_query_for_output(DataflowOutput const &o) { return DataflowOutputQuery{ query_set{o.node}, - query_set{o.idx}, + query_set{o.idx}, }; } diff --git a/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc b/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc index 300b5de546..ef9412b939 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc @@ -7,9 +7,9 @@ std::unordered_set IDataflowGraphView::query_edges(DirectedEdgeQuery const &q) const { DataflowEdgeQuery dataflow_query = DataflowEdgeQuery{ q.srcs, - matchall(), + matchall(), q.dsts, - matchall(), + matchall(), }; std::unordered_set dataflow_edges = this->query_edges(dataflow_query); diff --git a/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc b/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc index 3efea1c138..2de3056068 100644 --- a/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc +++ b/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc @@ -6,6 +6,7 @@ #include "utils/graph/digraph/algorithms/materialize_digraph_view.h" #include "utils/graph/instances/adjacency_digraph.h" #include "utils/graph/node/algorithms.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -15,7 +16,9 @@ DiGraphView transitive_closure(DiGraphView const &g) { // incredibly slow (> minutes) for even moderately sized graphs // (i.e., 200 nodes) without optimization enabled. 
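// Editor's note (illustrative, not part of the patch): the change below keeps
// the adjacency-matrix indexing on plain int by unwrapping the
// nonnegative_int keys that bidict_from_enumerating now produces, i.e.:
//
//   bidict<nonnegative_int, Node> enumerated =
//       bidict_from_enumerating(get_nodes(g));
//   bidict<int, Node> nodes = map_keys(
//       enumerated, [](nonnegative_int x) { return x.unwrap_nonnegative(); });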
- bidict nodes = bidict_from_enumerating(get_nodes(g)); + bidict nodes = + map_keys(bidict_from_enumerating(get_nodes(g)), + [](nonnegative_int x) { return x.unwrap_nonnegative(); }); std::unordered_set edges = get_edges(g); int num_nodes = nodes.size(); diff --git a/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc b/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc index 97a2439263..69b24b716c 100644 --- a/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc +++ b/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc @@ -37,7 +37,9 @@ DiGraphView transitive_reduction(DiGraphView const &g) { // transitive_closure inlined to avoid any drifts in node numbering // between transitive_closure and transitive_reduction - bidict nodes = bidict_from_enumerating(get_nodes(g)); + bidict nodes = + map_keys(bidict_from_enumerating(get_nodes(g)), + [](nonnegative_int x) { return x.unwrap_nonnegative(); }); int num_nodes = nodes.size(); std::vector edge_matrix(num_nodes * num_nodes, false); diff --git a/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc b/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc index 1ffc5f423f..a5a1fb82bf 100644 --- a/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc @@ -1,6 +1,5 @@ #include "utils/graph/instances/unordered_set_dataflow_graph.h" #include "utils/containers/are_disjoint.h" -#include "utils/containers/count.h" #include "utils/containers/enumerate_vector.h" #include "utils/containers/extend.h" #include "utils/containers/transform.h" @@ -9,6 +8,7 @@ #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/open_dataflow_edge.h" #include "utils/graph/open_dataflow_graph/open_dataflow_edge_query.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -25,18 +25,18 @@ UnorderedSetDataflowGraph::UnorderedSetDataflowGraph( } NodeAddedResult UnorderedSetDataflowGraph::add_node( - std::vector const &inputs, int num_outputs) { + std::vector const &inputs, nonnegative_int num_outputs) { std::vector open_inputs = transform( inputs, [](DataflowOutput const &o) { return OpenDataflowValue{o}; }); return this->add_node(open_inputs, num_outputs); } NodeAddedResult UnorderedSetDataflowGraph::add_node( - std::vector const &inputs, int num_outputs) { + std::vector const &inputs, nonnegative_int num_outputs) { Node new_node = this->node_source.new_node(); - std::vector new_outputs = - transform(count(num_outputs), [&](int output_idx) { + std::vector new_outputs = transform( + nonnegative_range(num_outputs), [&](nonnegative_int output_idx) { return DataflowOutput{new_node, output_idx}; }); diff --git a/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc b/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc index a404423284..fd4a8782a4 100644 --- a/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc +++ b/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc @@ -3,7 +3,7 @@ namespace FlexFlow { -std::vector add_nodes(MultiDiGraph &g, int num_nodes) { +std::vector add_nodes(MultiDiGraph &g, nonnegative_int num_nodes) { return repeat(num_nodes, [&]() { return g.add_node(); }); } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc new file mode 100644 index 0000000000..f7f8a9fd34 --- /dev/null 
+++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc @@ -0,0 +1,11 @@ +#include "utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h" +#include "utils/graph/open_dataflow_graph/algorithms/find_isomorphism.h" + +namespace FlexFlow { + +bool are_isomorphic(OpenDataflowGraphView const &src, + OpenDataflowGraphView const &dst) { + return find_isomorphism(src, dst).has_value(); +} + +} // namespace FlexFlow
diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc index 9077ea5f9a..261de287a9 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc @@ -2,13 +2,16 @@ #include "utils/dot_file.h" #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/dataflow_graph/algorithms/as_dot.h" +#include "utils/graph/labelled_dataflow_graph/labelled_dataflow_graph.h" #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { std::string as_dot(OpenDataflowGraphView const &g) { + std::function<std::string(Node const &)> get_node_label = [](Node const &n) { return fmt::format("n{}", n.raw_uid); }; @@ -36,9 +39,13 @@ std::string auto get_node_name = [](Node n) { return fmt::format("n{}", n.raw_uid); }; - auto get_input_field = [](int idx) { return fmt::format("i{}", idx); }; + auto get_input_field = [](nonnegative_int idx) { + return fmt::format("i{}", idx); + }; - auto get_output_field = [](int idx) { return fmt::format("o{}", idx); }; + auto get_output_field = [](nonnegative_int idx) { + return fmt::format("o{}", idx); + }; auto get_graph_input_name = [](DataflowGraphInput i) { return fmt::format("gi{}", i.idx);
diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc index cad00c71e1..728dc75678 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc @@ -27,13 +27,13 @@ std::vector<OpenDataflowEdge> get_incoming_edges(OpenDataflowGraphView const &g, DataflowInputEdgeQuery{ query_set<DataflowGraphInput>::matchall(), {n}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), }, DataflowEdgeQuery{ query_set<Node>::matchall(), - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), {n}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), }, }), [](OpenDataflowEdge const &l, OpenDataflowEdge const &r) {
diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index 95a8e095fc..6448da9c73 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -13,13 +13,13 @@ std::unordered_set<OpenDataflowEdge> DataflowInputEdgeQuery{ query_set<DataflowGraphInput>::matchall(), query_set{ns}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), }, DataflowEdgeQuery{ query_set{nodes_not_in_ns}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), query_set{ns}, - query_set<int>::matchall(), + query_set<nonnegative_int>::matchall(), }, };
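// Editor's note: a brief sketch (not part of the patch) of the are_isomorphic
// helper above. find_isomorphism returns an optional isomorphism witness;
// are_isomorphic simply asks whether one exists. The function name
// graphs_match is illustrative only.
#include "utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h"

using namespace FlexFlow;

bool graphs_match(OpenDataflowGraphView const &lhs,
                  OpenDataflowGraphView const &rhs) {
  // true iff some node/graph-input mapping makes the two views identical
  return are_isomorphic(lhs, rhs);
}
diff --git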
a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc new file mode 100644 index 0000000000..c55c4fe360 --- /dev/null +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc @@ -0,0 +1,54 @@ +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h" +#include "utils/overload.h" + +namespace FlexFlow { + +OpenDataflowValue isomorphism_map_r_open_dataflow_value_from_l( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &l_value) { + return l_value.visit(overload{ + [&](DataflowGraphInput const &l_input) { + return OpenDataflowValue{ + iso.input_mapping.at_l(l_input), + }; + }, + [&](DataflowOutput const &l_output) { + return OpenDataflowValue{ + isomorphism_map_r_dataflow_output_from_l(iso, l_output), + }; + }, + }); +} + +OpenDataflowValue isomorphism_map_l_open_dataflow_value_from_r( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &r_value) { + return r_value.visit(overload{ + [&](DataflowGraphInput const &r_input) { + return OpenDataflowValue{ + iso.input_mapping.at_r(r_input), + }; + }, + [&](DataflowOutput const &r_output) { + return OpenDataflowValue{ + isomorphism_map_l_dataflow_output_from_r(iso, r_output), + }; + }, + }); +} + +DataflowOutput isomorphism_map_r_dataflow_output_from_l( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &l_output) { + return DataflowOutput{ + iso.node_mapping.at_l(l_output.node), + l_output.idx, + }; +} + +DataflowOutput isomorphism_map_l_dataflow_output_from_r( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &r_output) { + return DataflowOutput{ + iso.node_mapping.at_r(r_output.node), + r_output.idx, + }; +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc b/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc index 8736f2d157..34adea6b09 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc @@ -6,14 +6,14 @@ DataflowInputEdgeQuery dataflow_input_edge_query_all() { return DataflowInputEdgeQuery{ query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowInputEdgeQuery dataflow_input_edge_query_none() { return DataflowInputEdgeQuery{ query_set::match_none(), query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -28,7 +28,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set{e.src}, query_set{e.dst.node}, - query_set{e.dst.idx}, + query_set{e.dst.idx}, }; } @@ -37,7 +37,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set{src}, query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } @@ -46,7 +46,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set::matchall(), query_set{dst.node}, - query_set{dst.idx}, + query_set{dst.idx}, }; } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc index d5e5b614af..d51562a6c6 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc @@ -7,7 +7,7 @@ Node get_open_dataflow_edge_dst_node(OpenDataflowEdge const &e) { 
return get_open_dataflow_edge_dst(e).node; } -int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &e) { +nonnegative_int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &e) { return get_open_dataflow_edge_dst(e).idx; } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc index 63222dd360..949f837665 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc @@ -4,7 +4,7 @@ namespace FlexFlow { NodeAddedResult OpenDataflowGraph::add_node(std::vector const &inputs, - int num_outputs) { + nonnegative_int num_outputs) { return this->get_interface().add_node(inputs, num_outputs); } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc b/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc index 0fdb2f408b..171b321c66 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc @@ -18,7 +18,7 @@ UnorderedSetOpenDataflowGraph::UnorderedSetOpenDataflowGraph( outputs(outputs), graph_inputs(graph_inputs) {} NodeAddedResult UnorderedSetOpenDataflowGraph::add_node( - std::vector const &inputs, int num_outputs) { + std::vector const &inputs, nonnegative_int num_outputs) { NOT_IMPLEMENTED(); } diff --git a/lib/utils/src/utils/graph/render_dot.cc b/lib/utils/src/utils/graph/render_dot.cc new file mode 100644 index 0000000000..8bdc001c80 --- /dev/null +++ b/lib/utils/src/utils/graph/render_dot.cc @@ -0,0 +1,90 @@ +#include "utils/graph/render_dot.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/try_at.h" +#include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/record_formatter.h" + +namespace FlexFlow { + +std::string escape_dot_string(std::string const &s) { + auto escape_dot_char = [](char c) -> std::string { + switch (c) { + case '\\': + case '"': + return std::string{'\\'} + c; + default: + return std::string{c}; + } + }; + + return flatmap(s, escape_dot_char); +} + +std::string render_dot_node_attrs( + std::unordered_map const &node_attrs) { + std::ostringstream oss; + for (auto const &[k, v] : node_attrs) { + oss << fmt::format( + "\"{}\"=\"{}\",", escape_dot_string(k), escape_dot_string(v)); + } + return oss.str(); +} + +std::string render_node_label( + LabelledDataflowGraphView, + std::string> const &g, + Node const &n) { + std::vector n_inputs = get_dataflow_inputs(g, n); + std::vector n_outputs = get_outputs(g, n); + + RecordFormatter inputs_record; + for (DataflowInput const &i : n_inputs) { + inputs_record << fmt::format("{}", i.idx, i.idx); + } + + RecordFormatter outputs_record; + for (DataflowOutput const &o : n_outputs) { + outputs_record << fmt::format("{}", o.idx, g.at(o)); + } + + RecordFormatter rec; + rec << inputs_record + << try_at(g.at(n), std::string{"label"}) + .value_or(fmt::to_string(n.raw_uid)) + << outputs_record; + + std::ostringstream oss; + oss << rec; + return oss.str(); +} + +std::string render_dot( + LabelledDataflowGraphView, + std::string> const &g) { + std::vector lines; + lines.push_back("digraph {"); + + for (Node const &n : get_nodes(g)) { + std::unordered_map node_attrs = g.at(n); + node_attrs.at("label") = render_node_label(g, n); + node_attrs["shape"] = "record"; + + 
lines.push_back(fmt::format( + " n{} [{}];", n.raw_uid, render_dot_node_attrs(node_attrs))); + } + + for (DataflowEdge const &e : get_edges(g)) { + lines.push_back(fmt::format(" n{}:o{} -> n{}:i{};", + e.src.node.raw_uid, + e.src.idx, + e.dst.node.raw_uid, + e.dst.idx)); + } + + lines.push_back("}"); + + return join_strings(lines, "\n"); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/src/utils/nonnegative_int/ceildiv.cc new file mode 100644 index 0000000000..f1115b25b5 --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/ceildiv.cc @@ -0,0 +1,20 @@ +#include "utils/nonnegative_int/ceildiv.h" +#include "utils/exception.h" + +namespace FlexFlow { + +nonnegative_int ceildiv(nonnegative_int numerator, + nonnegative_int denominator) { + if (denominator == 0) { + throw mk_runtime_error(fmt::format( + "ceildiv expected denominator != 0, but received {}", denominator)); + } + + int n = numerator.unwrap_nonnegative(); + int d = denominator.unwrap_nonnegative(); + + int result = (n + d - 1) / d; + return nonnegative_int{result}; +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc index 9088cc4bf9..e86c242250 100644 --- a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc @@ -1,4 +1,5 @@ #include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/exception.h" namespace FlexFlow { @@ -10,6 +11,15 @@ nonnegative_int::nonnegative_int(int value) { this->value_ = value; } +nonnegative_int::nonnegative_int(size_t value) { + if (value > std::numeric_limits::max()) { + throw std::invalid_argument(fmt::format( + "Input {} to nonnegative_int(size_t) is out-of-bounds for int", value)); + } + this->value_ = static_cast(value); + assert(this->value_ >= 0); +} + nonnegative_int::operator int() const noexcept { return this->value_; } @@ -75,18 +85,72 @@ nonnegative_int nonnegative_int::operator+(nonnegative_int const &other) const { return nonnegative_int{this->value_ + other.value_}; } +nonnegative_int &nonnegative_int::operator++() { + this->value_++; + return *this; +} + +nonnegative_int nonnegative_int::operator++(int) { + nonnegative_int result = *this; + this->value_++; + return result; +} + +nonnegative_int &nonnegative_int::operator+=(nonnegative_int const &other) { + this->value_ += other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator*(nonnegative_int const &other) const { + return nonnegative_int{this->value_ * other.value_}; +} + +nonnegative_int &nonnegative_int::operator*=(nonnegative_int const &other) { + this->value_ *= other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator/(nonnegative_int const &other) const { + return nonnegative_int{this->value_ / other.value_}; +} + +nonnegative_int &nonnegative_int::operator/=(nonnegative_int const &other) { + this->value_ /= other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator%(nonnegative_int const &other) const { + return nonnegative_int{this->value_ % other.value_}; +} + +nonnegative_int &nonnegative_int::operator%=(nonnegative_int const &other) { + this->value_ %= other.value_; + return *this; +} + std::ostream &operator<<(std::ostream &os, nonnegative_int const &n) { os << n.value_; return os; } -int nonnegative_int::get_value() const { +int nonnegative_int::unwrap_nonnegative() const { return this->value_; } int format_as(nonnegative_int const &x) { - 
return x.get_value(); + return x.unwrap_nonnegative(); } + +nonnegative_int operator""_n(unsigned long long int x) { + if (x > + static_cast<unsigned long long int>(std::numeric_limits<int>::max())) { + throw mk_runtime_error( + fmt::format("Value too large to wrap as nonnegative_int: {}", x)); + } + + return nonnegative_int{static_cast<int>(x)}; +} + } // namespace FlexFlow namespace nlohmann { @@ -97,13 +161,20 @@ ::FlexFlow::nonnegative_int void adl_serializer<::FlexFlow::nonnegative_int>::to_json( json &j, ::FlexFlow::nonnegative_int t) { - j = t.get_value(); + j = t.unwrap_nonnegative(); } } // namespace nlohmann +namespace rc { +Gen<::FlexFlow::nonnegative_int> + Arbitrary<::FlexFlow::nonnegative_int>::arbitrary() { + return gen::construct<::FlexFlow::nonnegative_int>(gen::nonNegative<int>()); +} +} // namespace rc + namespace std { std::size_t hash<::FlexFlow::nonnegative_int>::operator()( FlexFlow::nonnegative_int const &n) const noexcept { - return std::hash<int>{}(n.get_value()); + return std::hash<int>{}(n.unwrap_nonnegative()); } } // namespace std
diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc new file mode 100644 index 0000000000..f31db6d589 --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc @@ -0,0 +1,19 @@ +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/containers/range.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +std::vector<nonnegative_int> nonnegative_range(nonnegative_int end) { + return transform(range(end.unwrap_nonnegative()), + [](int x) { return nonnegative_int{x}; }); +} + +std::vector<nonnegative_int> + nonnegative_range(nonnegative_int start, nonnegative_int end, int step) { + return transform( + range(start.unwrap_nonnegative(), end.unwrap_nonnegative(), step), + [](int x) { return nonnegative_int{x}; }); +} + +} // namespace FlexFlow
diff --git a/lib/utils/src/utils/nonnegative_int/num_elements.cc b/lib/utils/src/utils/nonnegative_int/num_elements.cc new file mode 100644 index 0000000000..21292bf2ab --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/num_elements.cc @@ -0,0 +1,10 @@ +#include "utils/nonnegative_int/num_elements.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using E = value_type<0>; + +template nonnegative_int num_elements(std::vector<E> const &); + +} // namespace FlexFlow
diff --git a/lib/utils/test/src/main.cc b/lib/utils/test/src/main.cc deleted file mode 100644 index 9522fa7fdb..0000000000 --- a/lib/utils/test/src/main.cc +++ /dev/null @@ -1,2 +0,0 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h"
diff --git a/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc b/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc index b5a373e5c9..a669869fb8 100644 --- a/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc +++ b/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc @@ -10,10 +10,12 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("bidict_from_enumerating(std::unordered_set)") { std::unordered_set<std::string> input = {"zero", "one", "two"}; - bidict<int, std::string> result = bidict_from_enumerating(input); + bidict<nonnegative_int, std::string> result = + bidict_from_enumerating(input); - std::unordered_set<int> result_left_entries = left_entries(result); - std::unordered_set<int> correct_left_entries = {0, 1, 2}; + std::unordered_set<nonnegative_int> result_left_entries = + left_entries(result); + std::unordered_set<nonnegative_int> correct_left_entries = {0_n, 1_n, 2_n}; CHECK(result_left_entries == correct_left_entries); std::unordered_set<std::string>
result_right_entries = @@ -25,13 +27,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("bidict_from_enumerating(std::set)") { std::set input = {"a", "c", "b"}; - bidict correct = { - {0, "a"}, - {1, "b"}, - {2, "c"}, + bidict correct = { + {0_n, "a"}, + {1_n, "b"}, + {2_n, "c"}, }; - bidict result = bidict_from_enumerating(input); + bidict result = + bidict_from_enumerating(input); CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/cli/cli_parse.cc b/lib/utils/test/src/utils/cli/cli_parse.cc index 40dea86ae0..72a09efbde 100644 --- a/lib/utils/test/src/utils/cli/cli_parse.cc +++ b/lib/utils/test/src/utils/cli/cli_parse.cc @@ -24,8 +24,8 @@ TEST_SUITE(FF_TEST_SUITE) { {}, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; SUBCASE("correctly parses short flag") { std::string input = "-2"; @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { }, {}, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; SUBCASE("parses flags in any order") { std::vector inputs = {"prog_name", "-2", "--flag1"}; @@ -180,8 +180,8 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; SUBCASE("can parse multiple positional arguments") { std::vector inputs = {"prog_name", "hello", "world"}; @@ -266,7 +266,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CLIPositionalArgumentKey key_posarg = CLIPositionalArgumentKey{0}; + CLIPositionalArgumentKey key_posarg = CLIPositionalArgumentKey{0_n}; SUBCASE( "succeeds if a positional argument is set to a valid choice") { @@ -351,11 +351,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; - CLIFlagKey key_flag3 = CLIFlagKey{2}; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; + CLIFlagKey key_flag3 = CLIFlagKey{2_n}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; SUBCASE("works if flags are before positional arguments") { std::vector inputs = { @@ -449,11 +449,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; - CLIFlagKey key_flag3 = CLIFlagKey{2}; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; + CLIFlagKey key_flag3 = CLIFlagKey{2_n}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; int argc = 5; char const *argv[] = {"prog_name", "red", "-f", "world", "--flag3"}; diff --git a/lib/utils/test/src/utils/containers/at_idx.cc b/lib/utils/test/src/utils/containers/at_idx.cc new file mode 100644 index 0000000000..b2a6286b62 --- /dev/null +++ b/lib/utils/test/src/utils/containers/at_idx.cc @@ -0,0 +1,29 @@ +#include 
"utils/containers/at_idx.h" +#include "test/utils/doctest/fmt/optional.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("at_idx(std::vector, nonnegative_int)") { + std::vector vec = {1, 3, 2, 3}; + + SUBCASE("idx is in bounds") { + nonnegative_int idx = 1_n; + + std::optional result = at_idx(vec, idx); + std::optional correct = 3; + + CHECK(result == correct); + } + + SUBCASE("idx is out of bounds") { + nonnegative_int idx = 4_n; + + std::optional result = at_idx(vec, idx); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/containers/enumerate.cc b/lib/utils/test/src/utils/containers/enumerate.cc index 2f9a5b3c02..2fdb2e481e 100644 --- a/lib/utils/test/src/utils/containers/enumerate.cc +++ b/lib/utils/test/src/utils/containers/enumerate.cc @@ -17,26 +17,27 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("enumerate(std::vector)") { std::vector input = {"zero", "one", "two", "three"}; - std::map correct = { - {0, "zero"}, - {1, "one"}, - {2, "two"}, - {3, "three"}, + std::map correct = { + {0_n, "zero"}, + {1_n, "one"}, + {2_n, "two"}, + {3_n, "three"}, }; - std::map result = enumerate(input); + std::map result = enumerate(input); CHECK(result == correct); SUBCASE("check iteration order") { - std::vector> iterated_result = - vector_of(result); - std::vector> correct_iteration_order = { - {0, "zero"}, - {1, "one"}, - {2, "two"}, - {3, "three"}, - }; + std::vector> + iterated_result = vector_of(result); + std::vector> + correct_iteration_order = { + {0_n, "zero"}, + {1_n, "one"}, + {2_n, "two"}, + {3_n, "three"}, + }; CHECK(iterated_result == correct_iteration_order); } @@ -45,9 +46,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("enumerate(std::unordered_set)") { std::unordered_set input = {"A", "B", "C", "D"}; - std::unordered_set correct_keys = {0, 1, 2, 3}; + std::unordered_set correct_keys = {0_n, 1_n, 2_n, 3_n}; std::unordered_multiset correct_values = {"A", "B", "C", "D"}; - std::map result = enumerate(input); + std::map result = enumerate(input); CHECK(keys(result) == correct_keys); CHECK(unordered_multiset_of(values(result)) == correct_values); diff --git a/lib/utils/test/src/utils/containers/enumerate_vector.cc b/lib/utils/test/src/utils/containers/enumerate_vector.cc new file mode 100644 index 0000000000..fa5c5cf6fb --- /dev/null +++ b/lib/utils/test/src/utils/containers/enumerate_vector.cc @@ -0,0 +1,33 @@ +#include "utils/containers/enumerate_vector.h" +#include "test/utils/doctest/fmt/map.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("enumerate_vector(std::vector)") { + SUBCASE("input vector is empty") { + std::vector input = {}; + + std::map result = enumerate_vector(input); + std::map correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input vector is not empty") { + std::vector input = {2, 3, 1, 3, 3}; + + std::map result = enumerate_vector(input); + std::map correct = { + {0_n, 2}, + {1_n, 3}, + {2_n, 1}, + {3_n, 3}, + {4_n, 3}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/containers/flatmap.cc b/lib/utils/test/src/utils/containers/flatmap.cc index bd6d3ae5be..6a6d3c86a8 100644 --- a/lib/utils/test/src/utils/containers/flatmap.cc +++ b/lib/utils/test/src/utils/containers/flatmap.cc @@ -73,6 +73,38 @@ TEST_SUITE(FF_TEST_SUITE) { } } + TEST_CASE("flatmap(std::string, F)") { + std::string input = "aBabcBc"; + + SUBCASE("replacement length > 1") { + std::string result = flatmap(input, [](char c) -> 
std::string {
+        if (c == 'B') {
+          return "..";
+        } else {
+          return std::string{c};
+        }
+      });
+
+      std::string correct = "a..abc..c";
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("replacement length == 0") {
+      std::string result = flatmap(input, [](char c) -> std::string {
+        if (c == 'B') {
+          return "";
+        } else {
+          return std::string{c};
+        }
+      });
+
+      std::string correct = "aabcc";
+
+      CHECK(result == correct);
+    }
+  }
+
   TEST_CASE("flatmap(std::unordered_map, F)") {
     auto de_nest_keys = [](int k1,
                            std::unordered_map const &v) {
diff --git a/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc b/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc
index f25bcf65b1..9fb4048691 100644
--- a/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc
+++ b/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc
@@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     std::vector<int> input = {1, 2, 3};
 
     std::unordered_multiset<std::vector<int>> result =
-        get_all_permutations_with_repetition(input, 1);
+        get_all_permutations_with_repetition(input, 1_n);
     std::unordered_multiset<std::vector<int>> correct = {
         {1},
         {2},
@@ -27,7 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     std::vector<int> input = {1};
 
     std::unordered_multiset<std::vector<int>> result =
-        get_all_permutations_with_repetition(input, 2);
+        get_all_permutations_with_repetition(input, 2_n);
     std::unordered_multiset<std::vector<int>> correct = {
         {1, 1},
     };
@@ -39,7 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     std::vector<int> input = {1, 2};
 
     std::unordered_multiset<std::vector<int>> result =
-        get_all_permutations_with_repetition(input, 3);
+        get_all_permutations_with_repetition(input, 3_n);
     std::unordered_multiset<std::vector<int>> correct = {
         {1, 1, 1},
         {1, 1, 2},
@@ -58,7 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     std::vector<int> input = {1, 2, 2};
 
     std::unordered_multiset<std::vector<int>> result =
-        get_all_permutations_with_repetition(input, 2);
+        get_all_permutations_with_repetition(input, 2_n);
     std::unordered_multiset<std::vector<int>> correct = {{1, 1},
                                                          {1, 2},
                                                          {1, 2},
diff --git a/lib/utils/test/src/utils/containers/make.cc b/lib/utils/test/src/utils/containers/make.cc
new file mode 100644
index 0000000000..4070f5b35a
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/make.cc
@@ -0,0 +1,15 @@
+#include "utils/containers/make.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("make") {
+    auto f = make<int>();
+
+    int result = f(true);
+    int correct = 1;
+
+    CHECK(result == correct);
+  }
+}
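A minimal sketch of how the helper exercised by the new make.cc test is meant
to be used (the transform call below is illustrative, not from the diff):

    auto to_int = make<int>(); // unary callable performing int{x}
    int one = to_int(true);    // == 1, as the test checks
    // handy wherever a higher-order utility expects a callable, e.g.
    // transform(std::vector<bool>{true, false}, make<int>()) yields {1, 0}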
SUBCASE("overlapping keys") { - std::unordered_map lhs = {{1, "one"}, {2, "two"}}; - std::unordered_map rhs = {{2, "three"}, {3, "four"}}; - - CHECK_THROWS(merge_maps(lhs, rhs)); + SUBCASE("maps are not disjoint") { + CHECK_THROWS(merge_disjoint_maps(l_map, l_map)); } } + + TEST_CASE("merge_map_left_dominates") { + std::unordered_map l_map = { + {1, "one"}, + {2, "left_two"}, + }; + + std::unordered_map r_map = { + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map correct = { + {1, "one"}, + {2, "left_two"}, + {3, "three"}, + }; + + std::unordered_map result = + merge_map_left_dominates(l_map, r_map); + + CHECK(result == correct); + } + + TEST_CASE("merge_map_right_dominates") { + std::unordered_map l_map = { + {1, "one"}, + {2, "left_two"}, + }; + + std::unordered_map r_map = { + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map correct = { + {1, "one"}, + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map result = + merge_map_right_dominates(l_map, r_map); + + CHECK(result == correct); + } } diff --git a/lib/utils/test/src/utils/containers/product.cc b/lib/utils/test/src/utils/containers/product.cc index 3fa94c8e9e..2278bfba17 100644 --- a/lib/utils/test/src/utils/containers/product.cc +++ b/lib/utils/test/src/utils/containers/product.cc @@ -1,4 +1,6 @@ #include "utils/containers/product.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include #include #include #include @@ -29,4 +31,22 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(correct == result); } } + + TEST_CASE("product(std::vector)") { + SUBCASE("non-empty container") { + std::vector input = {1_n, 2_n, 3_n, 5_n}; + nonnegative_int correct = 30_n; + auto result = product(input); + CHECK(correct == result); + } + + SUBCASE("empty container") { + std::vector input = {5_n}; + nonnegative_int correct = 5_n; + // correct = nonnegative_int{x}; + // CHECK(x == 3); + nonnegative_int result = product(input); + CHECK(correct == correct); + } + } } diff --git a/lib/utils/test/src/utils/containers/repeat.cc b/lib/utils/test/src/utils/containers/repeat.cc index d8ffe76a64..d2fc595f49 100644 --- a/lib/utils/test/src/utils/containers/repeat.cc +++ b/lib/utils/test/src/utils/containers/repeat.cc @@ -7,7 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("repeat") { int x = 0; - std::vector result = repeat(3, [&]() { + std::vector result = repeat(3_n, [&]() { int result = x; x += 2; return result; diff --git a/lib/utils/test/src/utils/containers/replicate.cc b/lib/utils/test/src/utils/containers/repeat_element.cc similarity index 69% rename from lib/utils/test/src/utils/containers/replicate.cc rename to lib/utils/test/src/utils/containers/repeat_element.cc index 1c7845642e..08bee8bec8 100644 --- a/lib/utils/test/src/utils/containers/replicate.cc +++ b/lib/utils/test/src/utils/containers/repeat_element.cc @@ -1,4 +1,4 @@ -#include "utils/containers/replicate.h" +#include "utils/containers/repeat_element.h" #include "test/utils/doctest/fmt/unordered_set.h" #include "test/utils/doctest/fmt/vector.h" #include @@ -7,16 +7,17 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("replicate") { + TEST_CASE("repeat_element") { SUBCASE("ints") { int x = 42; - std::vector result = replicate(5, x); + std::vector result = repeat_element(nonnegative_int{5}, x); std::vector correct = {42, 42, 42, 42, 42}; CHECK(result == correct); } SUBCASE("unordered_set") { std::unordered_set x = {1.0, 1.5}; - std::vector> result = replicate(3, x); + std::vector> result = + 
repeat_element(nonnegative_int{3}, x); std::vector> correct = { {1.0, 1.5}, {1.0, 1.5}, {1.0, 1.5}}; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc index 25f990f80e..ff491f6b85 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc @@ -12,19 +12,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_inputs/get_outputs") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({}, 1); + NodeAddedResult n3_added = g.add_node({}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -44,15 +44,15 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("topological_ordering") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc index f991b4a65e..0f812f2dec 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc @@ -11,21 +11,21 @@ TEST_SUITE(FF_TEST_SUITE) { "dataflow_graphs_are_isomorphic(DataflowGraphView, DataflowGraphView)") { auto g1 = DataflowGraph::create(); - NodeAddedResult g1_n1_added = g1.add_node({}, 1); + NodeAddedResult g1_n1_added = g1.add_node({}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); - NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1); + NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1_n); Node g1_n2_node = g1_n2_added.node; auto g2 = DataflowGraph::create(); SUBCASE("input graphs are isomorphic") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = true; @@ -36,12 +36,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input graphs are not isomorphic (different connectivity)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 
1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = - g2.add_node({g2_n1_output, g2_n1_output}, 1); + g2.add_node({g2_n1_output, g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = false; @@ -53,14 +53,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of src and sink " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 1); + NodeAddedResult g2_n3_added = g2.add_node({}, 1_n); Node g2_n3_node = g2_n3_added.node; bool correct = false; @@ -72,15 +72,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of internal " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; DataflowOutput g2_n2_output = get_only(g2_n2_added.outputs); - NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1); + NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1_n); Node g2_n3_node = g2_n3_added.node; bool correct = false; diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc index 160e4c4f73..8974d09832 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc @@ -10,21 +10,21 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_isomorphism(DataflowGraphView, DataflowGraphView)") { auto g1 = DataflowGraph::create(); - NodeAddedResult g1_n1_added = g1.add_node({}, 1); + NodeAddedResult g1_n1_added = g1.add_node({}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); - NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1); + NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1_n); Node g1_n2_node = g1_n2_added.node; auto g2 = DataflowGraph::create(); SUBCASE("input graphs are isomorphic") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct_isomorphism = @@ -41,12 +41,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input graphs are not isomorphic (different connectivity)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = - g2.add_node({g2_n1_output, g2_n1_output}, 1); + g2.add_node({g2_n1_output, g2_n1_output}, 1_n); Node g2_n2_node = 
g2_n2_added.node; std::optional correct_isomorphism = @@ -59,14 +59,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of src and sink " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 0); + NodeAddedResult g2_n3_added = g2.add_node({}, 0_n); Node g2_n3_node = g2_n3_added.node; std::optional correct_isomorphism = @@ -79,15 +79,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of internal " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; DataflowOutput g2_n2_output = get_only(g2_n2_added.outputs); - NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1); + NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1_n); Node g2_n3_node = g2_n3_added.node; std::optional correct_isomorphism = diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc index fec5d3401e..e619cc3b1c 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc @@ -11,12 +11,12 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraph g = DataflowGraph::create(); SUBCASE("gets edges if there are multiple") { - NodeAddedResult n1_added = g.add_node({}, 2); + NodeAddedResult n1_added = g.add_node({}, 2_n); Node n1 = n1_added.node; DataflowOutput n1_o0 = n1_added.outputs.at(0); DataflowOutput n1_o1 = n1_added.outputs.at(1); - NodeAddedResult n2_added = g.add_node({n1_o0, n1_o0, n1_o1}, 0); + NodeAddedResult n2_added = g.add_node({n1_o0, n1_o0, n1_o1}, 0_n); Node n2 = n2_added.node; std::unordered_set result = @@ -24,15 +24,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ n1_o0, - DataflowInput{n2, 0}, + DataflowInput{n2, 0_n}, }, DataflowEdge{ n1_o0, - DataflowInput{n2, 1}, + DataflowInput{n2, 1_n}, }, DataflowEdge{ n1_o1, - DataflowInput{n2, 2}, + DataflowInput{n2, 2_n}, }, }; @@ -40,15 +40,15 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("does not get edges to/from other nodes") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); @@ -61,11 +61,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE( "does not get flipped edges (i.e., respects from vs to direction)") { - 
NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 0); + NodeAddedResult n2_added = g.add_node({o1}, 0_n); Node n2 = n2_added.node; std::unordered_set result = @@ -76,10 +76,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("returns empty set if no edges exist between the given nodes") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; std::unordered_set result = @@ -91,7 +91,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns empty set if src node == dst node (as cycles cannot exist " "in DataflowGraph") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; std::unordered_set result = diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc index 86e4802cdb..f55afbacc1 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc @@ -10,34 +10,34 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_incoming_edges(DataflowGraphView, Node)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); SUBCASE("n4 - multiple incoming edges") { std::vector result = get_incoming_edges(g, n4); std::vector correct = { - DataflowEdge{o2, DataflowInput{n4, 0}}, - DataflowEdge{o3, DataflowInput{n4, 1}}}; + DataflowEdge{o2, DataflowInput{n4, 0_n}}, + DataflowEdge{o3, DataflowInput{n4, 1_n}}}; CHECK(result == correct); } SUBCASE("n3- single incoming edge") { std::vector result = get_incoming_edges(g, n3); std::vector correct = { - DataflowEdge{o2, DataflowInput{n3, 0}}, + DataflowEdge{o2, DataflowInput{n3, 0_n}}, }; CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc index be874b7e29..c37dcf5be7 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -10,26 +10,26 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_outgoing_edges(DataflowGraphView, Node)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + 
NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1}, 1); + NodeAddedResult n3_added = g.add_node({o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2}, 1); + NodeAddedResult n4_added = g.add_node({o2}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); SUBCASE("n2 - single outgoing edge") { std::unordered_set result = get_outgoing_edges(g, n2); std::unordered_set correct = { - DataflowEdge{o2, DataflowInput{n4, 0}}, + DataflowEdge{o2, DataflowInput{n4, 0_n}}, }; CHECK(result == correct); } @@ -37,8 +37,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("n1 - multiple outgoing edges") { std::unordered_set result = get_outgoing_edges(g, n1); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, }; CHECK(result == correct); } @@ -53,19 +53,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_outgoing_edges(DataflowGraphView, std::unordered_set)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1}, 1); + NodeAddedResult n3_added = g.add_node({o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2}, 1); + NodeAddedResult n4_added = g.add_node({o2}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -73,9 +73,9 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set nodes = {n1, n2}; std::unordered_set result = get_outgoing_edges(g, nodes); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, - DataflowEdge{o2, DataflowInput{n4, 0}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, + DataflowEdge{o2, DataflowInput{n4, 0_n}}, }; CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index 330628adfd..6c770a9d29 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -11,19 +11,19 @@ TEST_SUITE(FF_TEST_SUITE) { "std::unordered_set") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2, o1}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2, o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult 
n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -33,9 +33,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_subgraph_incoming_edges(g, input_node_set); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, - DataflowEdge{o1, DataflowInput{n3, 2}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 2_n}}, }; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc index 779d0a9560..bb7f3c4c30 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc @@ -11,19 +11,19 @@ TEST_SUITE(FF_TEST_SUITE) { "std::unordered_set") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -33,8 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { get_subgraph_outgoing_edges(g, input_node_set); std::unordered_set correct = { - DataflowEdge{o2, DataflowInput{n4, 1}}, - DataflowEdge{o3, DataflowInput{n4, 2}}, + DataflowEdge{o2, DataflowInput{n4, 1_n}}, + DataflowEdge{o3, DataflowInput{n4, 2_n}}, }; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc index c35789044d..4e26812315 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc @@ -19,19 +19,19 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); diff --git 
a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc index 1f8f66b932..38b722ec70 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc @@ -25,19 +25,19 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_leaf = [](Node const &n) { return BinarySPDecompositionTree{n}; }; SUBCASE("multiple nodes with edges across") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2, o1}, 1); + NodeAddedResult n3_added = g.add_node({o2, o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1}, 1); + NodeAddedResult n4_added = g.add_node({o1}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -54,15 +54,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ o1, - DataflowInput{n3, 1}, + DataflowInput{n3, 1_n}, }, DataflowEdge{ o2, - DataflowInput{n3, 0}, + DataflowInput{n3, 0_n}, }, DataflowEdge{ o1, - DataflowInput{n4, 0}, + DataflowInput{n4, 0_n}, }, }; @@ -70,12 +70,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("nodes each have multiple edges across") { - NodeAddedResult n1_added = g.add_node({}, 2); + NodeAddedResult n1_added = g.add_node({}, 2_n); Node n1 = n1_added.node; DataflowOutput n1_o1 = n1_added.outputs.at(0); DataflowOutput n1_o2 = n1_added.outputs.at(1); - NodeAddedResult n2_added = g.add_node({n1_o1, n1_o2, n1_o1}, 1); + NodeAddedResult n2_added = g.add_node({n1_o1, n1_o2, n1_o1}, 1_n); Node n2 = n2_added.node; TransitiveReducedDataflowGraphView tr_g = @@ -91,15 +91,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ n1_o1, - DataflowInput{n2, 0}, + DataflowInput{n2, 0_n}, }, DataflowEdge{ n1_o2, - DataflowInput{n2, 1}, + DataflowInput{n2, 1_n}, }, DataflowEdge{ n1_o1, - DataflowInput{n2, 2}, + DataflowInput{n2, 2_n}, }, }; @@ -107,19 +107,19 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("does not return edges eliminated by transitive reduction") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -136,7 +136,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ o2, - DataflowInput{n3, 1}, + 
DataflowInput{n3, 1_n}, }, }; diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc index 0e77739434..f922721fde 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc @@ -19,19 +19,19 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc b/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc index 7a3237d432..ec3ad86fe6 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { REQUIRE(result == correct); } - NodeAddedResult added = g.add_node({}, 2); + NodeAddedResult added = g.add_node({}, 2_n); { std::unordered_set result = g.query_nodes(node_query_all()); @@ -54,7 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { REQUIRE(result == correct); } - NodeAddedResult added2 = g.add_node(added.outputs, 3); + NodeAddedResult added2 = g.add_node(added.outputs, 3_n); { std::unordered_set result = g.query_nodes(node_query_all()); @@ -66,8 +66,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set result = g.query_edges(dataflow_edge_query_all()); std::unordered_set correct = { - DataflowEdge{added.outputs.at(0), DataflowInput{added2.node, 0}}, - DataflowEdge{added.outputs.at(1), DataflowInput{added2.node, 1}}, + DataflowEdge{added.outputs.at(0), DataflowInput{added2.node, 0_n}}, + DataflowEdge{added.outputs.at(1), DataflowInput{added2.node, 1_n}}, }; REQUIRE(result == correct); } diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc index 93d3d9605b..d9d91a03e9 100644 --- a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc +++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("add_edges(MultiDiGraph &, std::vector>)") { MultiDiGraph g = MultiDiGraph::create(); - std::vector n = add_nodes(g, 3); + std::vector n = add_nodes(g, 3_n); std::vector> input = { {n.at(0), n.at(1)}, diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc index e41bf33d6c..e3d9ee6a29 100644 --- 
a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc +++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("add_nodes(MultiDiGraph &, int)") { MultiDiGraph g = MultiDiGraph::create(); - std::unordered_set result = unordered_set_of(add_nodes(g, 3)); + std::unordered_set result = unordered_set_of(add_nodes(g, 3_n)); std::unordered_set correct = g.query_nodes(node_query_all()); CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc index aef6d9baff..0dfcc8a851 100644 --- a/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc +++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_edges(MultiDiGraphView)") { MultiDiGraph g = MultiDiGraph::create(); - std::vector n = add_nodes(g, 3); + std::vector n = add_nodes(g, 3_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc index 78aaa8d9fc..55b7b34e52 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc @@ -26,12 +26,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not empty") { DataflowGraphInput g1_i1 = g1.add_input(); - NodeAddedResult g1_n1_added = g1.add_node({OpenDataflowValue{g1_i1}}, 1); + NodeAddedResult g1_n1_added = + g1.add_node({OpenDataflowValue{g1_i1}}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); NodeAddedResult g1_n2_added = g1.add_node( - {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1); + {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1_n); Node g1_n2_node = g1_n2_added.node; SUBCASE("one graph is empty") { @@ -46,11 +47,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are isomorphic") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct = @@ -75,11 +76,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput g2_i1 = g2.add_input(); DataflowGraphInput g2_i2 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct = std::nullopt; @@ -93,12 +94,12 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different connectivity)") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + 
g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( {OpenDataflowValue{g2_n1_output}, OpenDataflowValue{g2_n1_output}}, - 1); + 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct = std::nullopt; @@ -112,14 +113,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different numbers of nodes)") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 0); + NodeAddedResult g2_n3_added = g2.add_node({}, 0_n); Node g2_n3_node = g2_n3_added.node; std::optional correct = std::nullopt; diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc index ff75e8fe48..fd54b801ce 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); DataflowGraphInput i1 = g.add_input(); - NodeAddedResult n0_added = g.add_node({}, 1); + NodeAddedResult n0_added = g.add_node({}, 1_n); std::unordered_set result = get_open_dataflow_graph_inputs(g); diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc index 7496c3009d..c7d294a588 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc @@ -18,19 +18,19 @@ TEST_SUITE(FF_TEST_SUITE) { NodeAddedResult n0_added = g.add_node( {OpenDataflowValue{i0}, OpenDataflowValue{i1}, OpenDataflowValue{i0}}, - 1); + 1_n); Node n0 = n0_added.node; DataflowOutput o0 = get_only(n0_added.outputs); NodeAddedResult n1_added = g.add_node( {OpenDataflowValue{i1}, OpenDataflowValue{o0}, OpenDataflowValue{i0}}, - 1); + 1_n); Node n1 = n1_added.node; std::unordered_set correct = { - DataflowInput{n0, 0}, - DataflowInput{n0, 2}, - DataflowInput{n1, 2}, + DataflowInput{n0, 0_n}, + DataflowInput{n0, 2_n}, + DataflowInput{n1, 2_n}, }; std::unordered_set result = @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 2); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 2_n); Node n0 = n0_added.node; DataflowOutput o0_0 = n0_added.outputs.at(0); DataflowOutput o0_1 = n0_added.outputs.at(1); @@ -53,16 +53,16 @@ TEST_SUITE(FF_TEST_SUITE) { NodeAddedResult n1_added = g.add_node({OpenDataflowValue{i0}, OpenDataflowValue{o0_1}, OpenDataflowValue{o0_0}}, - 1); + 1_n); Node n1 = n1_added.node; NodeAddedResult n2_added = - g.add_node({OpenDataflowValue{o0_1}, OpenDataflowValue{i0}}, 1); + 
g.add_node({OpenDataflowValue{o0_1}, OpenDataflowValue{i0}}, 1_n); Node n2 = n2_added.node; std::unordered_set correct = { - DataflowInput{n1, 1}, - DataflowInput{n2, 0}, + DataflowInput{n1, 1_n}, + DataflowInput{n2, 0_n}, }; std::unordered_set result = diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc index ddd6d74119..e1a2062865 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput g_i2 = g.add_input(); DataflowGraphInput g_i3 = g.add_input(); - NodeAddedResult g_n1_added = g.add_node({OpenDataflowValue{g_i2}}, 1); + NodeAddedResult g_n1_added = g.add_node({OpenDataflowValue{g_i2}}, 1_n); std::unordered_set result = get_unused_open_dataflow_graph_inputs(g); @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput g_i2 = g.add_input(); NodeAddedResult g_n1_added = - g.add_node({OpenDataflowValue{g_i1}, OpenDataflowValue{g_i2}}, 1); + g.add_node({OpenDataflowValue{g_i1}, OpenDataflowValue{g_i2}}, 1_n); std::unordered_set result = get_unused_open_dataflow_graph_inputs(g); diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc index bdb1bb4814..c53e069f68 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc @@ -21,12 +21,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not empty") { DataflowGraphInput g1_i1 = g1.add_input(); - NodeAddedResult g1_n1_added = g1.add_node({OpenDataflowValue{g1_i1}}, 1); + NodeAddedResult g1_n1_added = + g1.add_node({OpenDataflowValue{g1_i1}}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); NodeAddedResult g1_n2_added = g1.add_node( - {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1); + {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1_n); Node g1_n2_node = g1_n2_added.node; SUBCASE("one input graph is empty") { @@ -39,11 +40,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are isomorphic") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = true; @@ -57,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput g2_i1 = g2.add_input(); DataflowGraphInput g2_i2 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 
1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = false; @@ -73,12 +74,12 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different connectivity)") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( {OpenDataflowValue{g2_n1_output}, OpenDataflowValue{g2_n1_output}}, - 1); + 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = false; @@ -90,14 +91,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different numbers of nodes)") { DataflowGraphInput g2_i1 = g2.add_input(); NodeAddedResult g2_n1_added = - g2.add_node({OpenDataflowValue{g2_i1}}, 1); + g2.add_node({OpenDataflowValue{g2_i1}}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = g2.add_node( - {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1); + {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 0); + NodeAddedResult g2_n3_added = g2.add_node({}, 0_n); Node g2_n3_node = g2_n3_added.node; bool correct = false; diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc index b565e46e67..90682cf0f0 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc @@ -17,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); DataflowGraphInput i1 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node n0 = n0_added.node; DataflowOutput n0_output = get_only(n0_added.outputs); - NodeAddedResult n1_added = g.add_node({OpenDataflowValue{n0_output}}, 1); + NodeAddedResult n1_added = g.add_node({OpenDataflowValue{n0_output}}, 1_n); Node n1 = n1_added.node; DataflowOutput n1_output = get_only(n1_added.outputs); @@ -44,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { new_i0, DataflowInput{ n0, - 0, + 0_n, }, }, }, @@ -52,11 +52,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowEdge{ DataflowOutput{ n0, - 0, + 0_n, }, DataflowInput{ n1, - 0, + 0_n, }, }, }, @@ -65,11 +65,11 @@ TEST_SUITE(FF_TEST_SUITE) { { DataflowOutput{ n0, - 0, + 0_n, }, DataflowOutput{ n1, - 0, + 0_n, }, }, }; diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc index 36bcd16dad..1e7ad87d88 100644 --- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc @@ -17,12 +17,12 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node n0 = n0_added.node; DataflowOutput n0_output = get_only(n0_added.outputs); NodeAddedResult n1_added = - g.add_node({OpenDataflowValue{i0}, 
OpenDataflowValue{n0_output}}, 1); + g.add_node({OpenDataflowValue{i0}, OpenDataflowValue{n0_output}}, 1_n); Node n1 = n1_added.node; DataflowOutput n1_output = get_only(n1_added.outputs); @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { i0, DataflowInput{ new_node0, - 0, + 0_n, }, }, }, @@ -54,7 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { i0, DataflowInput{ new_node1, - 0, + 0_n, }, }, }, @@ -62,11 +62,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowEdge{ DataflowOutput{ new_node0, - 0, + 0_n, }, DataflowInput{ new_node1, - 1, + 1_n, }, }, }, @@ -75,11 +75,11 @@ TEST_SUITE(FF_TEST_SUITE) { { DataflowOutput{ new_node0, - 0, + 0_n, }, DataflowOutput{ new_node1, - 0, + 0_n, }, }, }; @@ -109,9 +109,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("check access to old edges") { OpenDataflowEdgeQuery query = OpenDataflowEdgeQuery{ dataflow_input_edge_query_for_edge( - DataflowInputEdge{i0, DataflowInput{n0, 0}}), + DataflowInputEdge{i0, DataflowInput{n0, 0_n}}), dataflow_edge_query_for_edge( - DataflowEdge{n0_output, DataflowInput{n1, 1}}), + DataflowEdge{n0_output, DataflowInput{n1, 1_n}}), }; std::unordered_set result_nodes = result.query_edges(query); @@ -121,12 +121,12 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("check access to new edges") { DataflowEdge new_standard_edge = DataflowEdge{ - DataflowOutput{new_node0, 0}, - DataflowInput{new_node1, 1}, + DataflowOutput{new_node0, 0_n}, + DataflowInput{new_node1, 1_n}, }; DataflowInputEdge new_input_edge = DataflowInputEdge{ i0, - DataflowInput{new_node0, 0}, + DataflowInput{new_node0, 0_n}, }; OpenDataflowEdgeQuery query = OpenDataflowEdgeQuery{ dataflow_input_edge_query_for_edge(new_input_edge), @@ -159,7 +159,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("check access to new outputs") { - DataflowOutput new_output = DataflowOutput{new_node0, 0}; + DataflowOutput new_output = DataflowOutput{new_node0, 0_n}; DataflowOutputQuery query = dataflow_output_query_for_output(new_output); diff --git a/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc b/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc index a62f528bcf..a2f818b5e9 100644 --- a/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc +++ b/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_parallel_reduction") { MultiDiGraph g = MultiDiGraph::create(); SUBCASE("base case") { - std::vector n = add_nodes(g, 2); + std::vector n = add_nodes(g, 2_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("does not apply when there is only one edge") { - std::vector n = add_nodes(g, 2); + std::vector n = add_nodes(g, 2_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, @@ -40,7 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("requires both ends be the same") { - std::vector n = add_nodes(g, 3); + std::vector n = add_nodes(g, 3_n); SUBCASE("branch out") { std::vector e = add_edges(g, { @@ -67,7 +67,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("finds one reduction when there are multiple") { - std::vector n = add_nodes(g, 2); + std::vector n = add_nodes(g, 2_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, @@ -86,7 +86,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("in larger graph") { - std::vector n = add_nodes(g, 5); + std::vector n = add_nodes(g, 5_n); std::vector e = add_edges(g, { {n.at(0), n.at(1)}, @@ -109,7 +109,7 @@ TEST_SUITE(FF_TEST_SUITE) { MultiDiGraph g = MultiDiGraph::create(); SUBCASE("base case") { - std::vector n = 
add_nodes(g, 2); + std::vector<Node> n = add_nodes(g, 2_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -142,7 +142,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("in larger graph") { - std::vector<Node> n = add_nodes(g, 5); + std::vector<Node> n = add_nodes(g, 5_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, diff --git a/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc b/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc index c6b45ec6ce..4bb57aeb0d 100644 --- a/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc +++ b/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_pre/post/center_node") { MultiDiGraph g = MultiDiGraph::create<AdjacencyMultiDiGraph>(); - std::vector<Node> n = add_nodes(g, 3); + std::vector<Node> n = add_nodes(g, 3_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_series_reduction") { MultiDiGraph g = MultiDiGraph::create<AdjacencyMultiDiGraph>(); SUBCASE("base case") { - std::vector<Node> n = add_nodes(g, 3); + std::vector<Node> n = add_nodes(g, 3_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -57,7 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not find if other edges are involved with center node") { SUBCASE("duplicate edge") { - std::vector<Node> n = add_nodes(g, 3); + std::vector<Node> n = add_nodes(g, 3_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -71,7 +71,7 @@ } SUBCASE("misc edge") { - std::vector<Node> n = add_nodes(g, 4); + std::vector<Node> n = add_nodes(g, 4_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -86,7 +86,7 @@ } SUBCASE("does find if other edges are involved with non-center node") { - std::vector<Node> n = add_nodes(g, 4); + std::vector<Node> n = add_nodes(g, 4_n); SUBCASE("edge from dst") { std::vector<MultiDiEdge> e = add_edges(g, { @@ -107,7 +107,7 @@ } SUBCASE("finds one reduction when there are multiple") { - std::vector<Node> n = add_nodes(g, 4); + std::vector<Node> n = add_nodes(g, 4_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -125,7 +125,7 @@ } SUBCASE("in larger graph") { - std::vector<Node> n = add_nodes(g, 8); + std::vector<Node> n = add_nodes(g, 8_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(2)}, @@ -149,7 +149,7 @@ MultiDiGraph g = MultiDiGraph::create<AdjacencyMultiDiGraph>(); SUBCASE("base case") { - std::vector<Node> n = add_nodes(g, 3); + std::vector<Node> n = add_nodes(g, 3_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(1)}, @@ -188,7 +188,7 @@ } SUBCASE("in larger graph") { - std::vector<Node> n = add_nodes(g, 8); + std::vector<Node> n = add_nodes(g, 8_n); std::vector<MultiDiEdge> e = add_edges(g, { {n.at(0), n.at(2)}, diff --git a/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc new file mode 100644 index 0000000000..7ac882ff9f --- /dev/null +++ b/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc @@ -0,0 +1,52 @@ +#include "utils/nonnegative_int/ceildiv.h" +#include <doctest/doctest.h> + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ceildiv(nonnegative_int, nonnegative_int)") { + SUBCASE("divides evenly") { + nonnegative_int numerator = 12_n; + nonnegative_int denominator = 3_n; + + nonnegative_int result = ceildiv(numerator, denominator); + nonnegative_int correct = 4_n; + + CHECK(result == correct); + } + + SUBCASE("does not divide evenly") { + nonnegative_int numerator = 17_n; + nonnegative_int denominator = 4_n; + + nonnegative_int result = ceildiv(numerator, denominator); + nonnegative_int correct = 5_n; + + CHECK(result == correct); + } + + SUBCASE("denominator is zero") { + nonnegative_int numerator = 15_n; + nonnegative_int denominator = 0_n; + + CHECK_THROWS(ceildiv(numerator, denominator)); + } + + SUBCASE("numerator is zero") { + nonnegative_int numerator = 0_n; + nonnegative_int denominator = 1_n; + + nonnegative_int result = ceildiv(numerator, denominator); + nonnegative_int correct = 0_n; + + CHECK(result == correct); + } + + SUBCASE("denominator and numerator are zero") { + nonnegative_int numerator = 0_n; + nonnegative_int denominator = 0_n; + + CHECK_THROWS(ceildiv(numerator, denominator)); + } + } +}
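The new ceildiv test file above pins down the helper's contract: round the quotient upward, define a zero numerator as yielding zero, and throw on a zero denominator. For reference, a minimal sketch of that contract using plain ints rather than the project's nonnegative_int wrapper (the function name and error text here are illustrative, not taken from the patch):

    #include <stdexcept>

    // Ceiling division via the usual (num + denom - 1) / denom identity,
    // valid for the nonnegative operands the tests above exercise:
    // 12/3 -> 4, 17/4 -> 5, 0/1 -> 0.
    int ceildiv_ref(int num, int denom) {
      if (denom == 0) {
        throw std::runtime_error("ceildiv: zero denominator");
      }
      return (num + denom - 1) / denom;
    }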
diff --git a/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc index 73d382d830..dfde11f9bd 100644 --- a/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc +++ b/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc @@ -198,13 +198,89 @@ TEST_SUITE(FF_TEST_SUITE) { } } - TEST_CASE("nonnegative_int + operation") { - nonnegative_int nn_int_1a = nonnegative_int{1}; - nonnegative_int nn_int_1b = nonnegative_int{1}; - nonnegative_int nn_int_2 = nonnegative_int{2}; - SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int") { - CHECK(nn_int_1a + nn_int_1b == nn_int_2); - } + TEST_CASE("nonnegative_int::operator+(nonnegative_int)") { + nonnegative_int result = nonnegative_int{1} + nonnegative_int{2}; + nonnegative_int correct = nonnegative_int{3}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator++() (pre-increment)") { + nonnegative_int input = nonnegative_int{1}; + + nonnegative_int result = ++input; + nonnegative_int correct = nonnegative_int{2}; + + CHECK(result == correct); + CHECK(input == correct); + } + + TEST_CASE("nonnegative_int::operator++(int) (post-increment)") { + nonnegative_int input = nonnegative_int{1}; + + nonnegative_int result = input++; + nonnegative_int correct_input = nonnegative_int{2}; + nonnegative_int correct_result = nonnegative_int{1}; + + CHECK(result == correct_result); + CHECK(input == correct_input); + } + + TEST_CASE("nonnegative_int::operator+=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{1}; + result += nonnegative_int{3}; + + nonnegative_int correct = nonnegative_int{4}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator*(nonnegative_int)") { + nonnegative_int result = nonnegative_int{2} * nonnegative_int{3}; + nonnegative_int correct = nonnegative_int{6}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator*=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{3}; + result *= nonnegative_int{6}; + + nonnegative_int correct = nonnegative_int{18}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator/(nonnegative_int)") { + nonnegative_int result = nonnegative_int{5} / nonnegative_int{2}; + nonnegative_int correct = nonnegative_int{2}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator/=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{13}; + result /= nonnegative_int{3}; + + nonnegative_int correct = nonnegative_int{4}; + + CHECK(result == correct); + } + + TEST_CASE("nonnegative_int::operator%(nonnegative_int)") { + nonnegative_int result = nonnegative_int{5} % nonnegative_int{2}; + nonnegative_int correct = nonnegative_int{1}; + + CHECK(result == correct); + } + + 
TEST_CASE("nonnegative_int::operator%=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{15}; + result %= nonnegative_int{4}; + + nonnegative_int correct = nonnegative_int{3}; + + CHECK(result == correct); } TEST_CASE("adl_serializer") { diff --git a/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc b/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc new file mode 100644 index 0000000000..db8fca295e --- /dev/null +++ b/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc @@ -0,0 +1,42 @@ +#include "utils/nonnegative_int/nonnegative_range.h" +#include "test/utils/doctest/fmt/vector.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("nonnegative_range(nonnegative_int)") { + SUBCASE("bound is greater than zero") { + std::vector result = + nonnegative_range(nonnegative_int{3}); + std::vector correct = { + nonnegative_int{0}, + nonnegative_int{1}, + nonnegative_int{2}, + }; + + CHECK(result == correct); + } + + SUBCASE("bound is zero") { + std::vector result = + nonnegative_range(nonnegative_int{0}); + std::vector correct = {}; + + CHECK(result == correct); + } + } + + TEST_CASE("nonnegative_range(nonnegative_int, nonnegative_int, int)") { + std::vector result = nonnegative_range( + /*start=*/nonnegative_int{7}, + /*end=*/nonnegative_int{3}, + /*step=*/-2); + std::vector correct = { + nonnegative_int{7}, + nonnegative_int{5}, + }; + + CHECK(result == correct); + } +} diff --git a/lib/utils/test/src/utils/nonnegative_int/num_elements.cc b/lib/utils/test/src/utils/nonnegative_int/num_elements.cc new file mode 100644 index 0000000000..0878be0410 --- /dev/null +++ b/lib/utils/test/src/utils/nonnegative_int/num_elements.cc @@ -0,0 +1,15 @@ +#include "utils/nonnegative_int/num_elements.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("num_elements") { + std::vector input = {-1, 3, 3, 1}; + + nonnegative_int result = num_elements(input); + nonnegative_int correct = nonnegative_int{4}; + + CHECK(result == correct); + } +} diff --git a/lib/utils/test/src/utils/random_utils.cc b/lib/utils/test/src/utils/random_utils.cc index 8e7d22138f..fdc48a64dd 100644 --- a/lib/utils/test/src/utils/random_utils.cc +++ b/lib/utils/test/src/utils/random_utils.cc @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("correct distribution") { auto check_probabilities = [](std::vector const &values, std::vector const &weights) { - int num_iterations = 10'000; + nonnegative_int num_iterations = 10'000_n; std::vector trials = repeat( num_iterations, [&]() { return select_random(values, weights); }); @@ -39,8 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { float expectedProbability = w / sum(weights); int num_occurrences = filter(trials, [&](int c) { return (c == v); }).size(); - float observedProbability = - static_cast(num_occurrences) / num_iterations; + float observedProbability = static_cast(num_occurrences) / + num_iterations.unwrap_nonnegative(); CHECK(observedProbability == doctest::Approx(expectedProbability).epsilon(0.01f)); } From fe339ebc140319f97049a17a81e3380269f69188 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 11 Jul 2024 14:33:42 -0700 Subject: [PATCH 27/42] test_utils refactor, local_cpu_allocator --- lib/kernels/test/src/test_replicate_kernel.cc | 2 +- lib/kernels/test/src/test_utils.cc | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 5133c4c89c..357d1958c0 100644 
--- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -48,7 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { - nonnegative_int num_replicas = 10_n; + nonnegative_int num_replicas = 2_n; TensorShape input_shape = make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index a15447446a..70cca5f2f0 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -99,7 +99,8 @@ struct CPUAccessorRContainsNonZero { T const *data_ptr = accessor.get<DT>(); - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + int volume = accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { if (data_ptr[i] != 0) { return true; } @@ -178,7 +179,8 @@ struct AccessorsAreEqual { T const *a_data_ptr = cpu_accessor_a.get<DT>(); T const *b_data_ptr = cpu_accessor_b.get<DT>(); - for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { + int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { if (a_data_ptr[i] != b_data_ptr[i]) { return false; } @@ -218,7 +220,9 @@ struct CreateFilledAccessorW { GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); T *data_ptr = src_accessor.get<DT>(); - for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { + + int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { data_ptr[i] = unwrapped_value; } 
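An aside on the test_utils hunks above: each loop now reads num_elements() once, unwraps the nonnegative_int into a plain int, and iterates against that bound. One wrinkle the patch keeps is a size_t loop index compared against the int bound, which mixes signedness; a sketch of the fully unwrapped form (illustrative, not part of the patch):

    // Hoist the element count out of the loop and unwrap it once; using an
    // int index as well keeps the comparison within a single integer type.
    int volume = accessor.shape.num_elements().unwrap_nonnegative();
    for (int i = 0; i < volume; i++) {
      // ... element-wise work on data_ptr[i] ...
    }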
From 2e2ae131b3d2fa1a11278e5e3482ceedd47f780c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Fri, 12 Jul 2024 12:54:48 -0700 Subject: [PATCH 28/42] test utils modification, cast, reverse, and replicate cpu kernels --- lib/kernels/src/cuda/ops/reverse_kernels.cu | 36 +++++- lib/kernels/test/src/test_cast_kernel.cc | 56 ++++++++++ lib/kernels/test/src/test_replicate_kernel.cc | 86 ++++++++++++++ lib/kernels/test/src/test_reverse_kernels.cc | 105 ++++++++++++++++++ 4 files changed, 277 insertions(+), 6 deletions(-) diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 2c25293c36..c750819266 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -20,6 +20,29 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { +// __global__ void reverse_forward_kernel(float const *in_ptr, +// float *out_ptr, +// coord_t num_out_blks, +// coord_t reverse_dim_size, +// coord_t in_blk_size) { +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = in_ptr[in_idx]; +// } +// } + +/* I mentioned this earlier, but I still think the reverse_forward_kernel code is incorrect, even though it matches the code in inference/master. Whenever I test the code and print the output, I get unexpected values, and I believe the cause is that the commented-out code above modifies the loop index i inside the loop body. 
+*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, coord_t reverse_dim_size, @@ -27,12 +50,13 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - i = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = i / in_blk_size; - i = i - reverse_dim_idx * in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = idx_within_blk / in_blk_size; + coord_t in_idx = idx_within_blk % in_blk_size; + coord_t input_index = + blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; + out_ptr[i] = in_ptr[input_index]; } }
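The rewritten kernel above derives every position from an unmodified i. For intuition, a self-contained host-side reference of the same mapping (a sketch, not part of the patch; CUDA_KERNEL_LOOP advances i by the grid stride after each iteration, which is exactly why the old kernel's in-body mutation of i corrupted both the output index and the loop's progression):

    #include <cstddef>
    #include <vector>

    // Output element i lies in block blk_idx; within that block it sits at
    // position reverse_dim_idx along the reversed dimension, at offset in_idx
    // inside the innermost block. Its source is the mirrored position along
    // the reversed dimension.
    std::vector<float> reverse_forward_reference(std::vector<float> const &in,
                                                 std::size_t num_out_blks,
                                                 std::size_t reverse_dim_size,
                                                 std::size_t in_blk_size) {
      std::vector<float> out(in.size());
      for (std::size_t i = 0; i < num_out_blks * reverse_dim_size * in_blk_size;
           i++) {
        std::size_t blk_idx = i / (reverse_dim_size * in_blk_size);
        std::size_t idx_within_blk = i % (reverse_dim_size * in_blk_size);
        std::size_t reverse_dim_idx = idx_within_blk / in_blk_size;
        std::size_t in_idx = idx_within_blk % in_blk_size;
        out.at(i) =
            in.at(blk_idx * (reverse_dim_size * in_blk_size) +
                  (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size +
                  in_idx);
      }
      return out;
    }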
diff --git a/lib/kernels/test/src/test_cast_kernel.cc index 2ac27a9747..10e3ef791b 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -75,4 +75,60 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100, 100}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100, 100}); + + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + auto transform = [start_val = 1.1f, + counter = 0.0f](float input) mutable -> float { + return start_val + counter++; + }; + + // Run GPU Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), + read_only_accessor_from_write_accessor(input_accessor_gpu), + output_accessor_gpu, + DataType::FLOAT, + DataType::INT32); + std::vector<int32_t> result_data_gpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), + true); + + // Run CPU Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + Kernels::Cast::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu, + DataType::FLOAT, + DataType::INT32); + std::vector<int32_t> result_data_cpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc index 357d1958c0..8bb6086543 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -110,4 +110,90 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } + + TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { + std::size_t num_replicas = 10; + + // This should be like three shapes: pre_replication, replication shape, and + // reduced shape, but things are weird cause doesn't seem to be replicating + // anything + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape replicated_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape reduced_shape = + make_tensor_shape_from_legion_dims({10}); + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(input_shape, gpu_allocator)); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + std::vector<float> result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Replicate Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + copy_tensor_between_memories( + input_accessor_gpu, input_shape, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu); + + std::vector<float> result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + GenericTensorAccessorR output_grad_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + input_grad_accessor_gpu, + output_grad_accessor_gpu, + num_replicas); + + std::vector<float> result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + output_grad_accessor_gpu, replicated_shape, cpu_allocator); + + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::CPU::backward_kernel( + input_grad_accessor_cpu, + read_only_accessor_from_write_accessor(output_grad_accessor_cpu), + num_replicas); + + std::vector<float> result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + }
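The comment at the top of the replicate test above flags uncertainty about the shapes involved; for reference, the semantics the GPU-vs-CPU comparison is checking is copy-on-forward, sum-on-backward. A standalone sketch, assuming replicas are laid out as contiguous blocks (this is our illustration, not the project's accessor API):

    #include <cstddef>
    #include <vector>

    // Forward: the output is num_replicas back-to-back copies of the input.
    std::vector<float> replicate_forward_ref(std::vector<float> const &in,
                                             std::size_t num_replicas) {
      std::vector<float> out;
      for (std::size_t r = 0; r < num_replicas; r++) {
        out.insert(out.end(), in.begin(), in.end());
      }
      return out;
    }

    // Backward: the input gradient is the elementwise sum over the replicas'
    // output gradients.
    std::vector<float> replicate_backward_ref(std::vector<float> const &out_grad,
                                              std::size_t num_replicas) {
      std::size_t n = out_grad.size() / num_replicas;
      std::vector<float> in_grad(n, 0.0f);
      for (std::size_t r = 0; r < num_replicas; r++) {
        for (std::size_t i = 0; i < n; i++) {
          in_grad[i] += out_grad[r * n + i];
        }
      }
      return in_grad;
    }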
diff --git a/lib/kernels/test/src/test_reverse_kernels.cc index bf23188a8f..b865792f3f 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -137,4 +137,109 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 2; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 5; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + std::vector<float> result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::CPU::forward_kernel( + input_accessor_cpu.get_float_ptr(), + output_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_cpu.shape.num_elements()); + + std::vector<float> result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + // Run GPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_gpu = + create_random_filled_accessor_w(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + std::vector<float> result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + // Run CPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + read_only_accessor_from_write_accessor(output_grad_accessor_gpu), + output_shape, + cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::CPU::backward_kernel( + output_grad_accessor_cpu.get_float_ptr(), + input_grad_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_cpu.shape.num_elements()); + + std::vector<float> result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } From 6c30466fc3980e6c5d169ec35a89f62720ed61e2 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 14 Jul 2024 15:45:59 -0700 Subject: [PATCH 29/42] combine kernel --- lib/kernels/src/local_cuda_allocator.cc | 1 + lib/kernels/test/src/test_cast_kernel.cc | 4 ++-- lib/kernels/test/src/test_replicate_kernel.cc | 13 +++++++------ lib/kernels/test/src/test_reverse_kernels.cc | 8 ++++---- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/lib/kernels/src/local_cuda_allocator.cc 
b/lib/kernels/src/local_cuda_allocator.cc index 416768a479..c72020acb2 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -6,6 +6,7 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); this->ptrs.insert(ptr); return ptr; } diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 10e3ef791b..77d602a89d 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -112,7 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_gpu), - true); + false); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 8bb6086543..fc61458568 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -116,7 +116,7 @@ TEST_SUITE(FF_TEST_SUITE) { // This should be like three shapes: pre_replication, replication shape, and // reduced shape, but things are weird cause doesn't seem to be replicating - // anything + // anything (ie. input shape should be same as reduced shape) TensorShape input_shape = make_tensor_shape_from_legion_dims({10, num_replicas}); TensorShape replicated_shape = @@ -142,7 +142,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -156,12 +156,13 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor_cpu); std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = read_only_accessor_from_write_accessor( create_random_filled_accessor_w(replicated_shape, gpu_allocator)); @@ -175,12 +176,12 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); + // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( output_grad_accessor_gpu, replicated_shape, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(reduced_shape); @@ -191,7 +192,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index b865792f3f..d1c5274dc8 100644 --- 
a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -175,7 +175,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_gpu.shape.num_elements()); std::vector<float> result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -193,7 +193,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_cpu.shape.num_elements()); std::vector<float> result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } @@ -216,7 +216,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector<float> result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = @@ -237,7 +237,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector<float> result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); }
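Why the cudaMemset that this patch adds to LocalCudaAllocator::allocate matters: several backward kernels in these tests accumulate into gradient buffers rather than overwriting them, so a freshly cudaMalloc'd buffer starts as garbage and makes the GPU-vs-CPU comparison nondeterministic. A minimal sketch of the failure mode (illustrative only; n is an assumed element count):

    // With plain cudaMalloc, in_grad holds garbage, so an accumulating
    // backward pass computes garbage + gradient; zeroing first makes the
    // += accumulation well-defined.
    float *in_grad;
    checkCUDA(cudaMalloc(&in_grad, n * sizeof(float)));
    checkCUDA(cudaMemset(in_grad, 0, n * sizeof(float)));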
From 5b5c591ab1b7d209775485ee69af3239bd769fa8 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Wed, 31 Jul 2024 04:49:13 -0700 Subject: [PATCH 30/42] test utils logic cleanup, reverse cpu_kernel pedagogical implementation, other minor fixes --- lib/kernels/include/kernels/allocation.h | 6 +++ .../include/kernels/local_cpu_allocator.h | 1 + .../include/kernels/local_cuda_allocator.h | 1 + lib/kernels/src/allocation.cc | 4 ++ lib/kernels/src/local_cpu_allocator.cc | 3 +- lib/kernels/src/local_cuda_allocator.cc | 7 +++ lib/kernels/test/src/test_attention_kernel.cc | 26 +++++++---- .../test/src/test_batch_matmul_kernel.cc | 12 +++-- .../test/src/test_batch_norm_kernel.cc | 15 ++++-- lib/kernels/test/src/test_cast_kernel.cc | 25 +++++----- lib/kernels/test/src/test_dropout.cc | 6 ++- lib/kernels/test/src/test_gather_kernels.cc | 3 +- .../test/src/test_layer_norm_kernels.cc | 3 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 ++- lib/kernels/test/src/test_replicate_kernel.cc | 46 +++++++++---------- lib/kernels/test/src/test_reverse_kernels.cc | 38 ++++++++------- lib/kernels/test/src/test_softmax_kernel.cc | 6 ++- lib/kernels/test/src/test_split_kernel.cc | 6 ++- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- .../local-execution/tracked_allocator.h | 1 + lib/local-execution/src/tracked_allocator.cc | 6 +++ 21 files changed, 138 insertions(+), 86 deletions(-) diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 4bf97118ce..7a8b844cf4 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,10 +5,13 @@ #include #include +enum class AllocLocation { HOST, DEVICE }; + namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; + virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; virtual DeviceType get_allocation_device_type() const = 0; @@ -22,6 +25,7 @@ struct Allocator { GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); void *allocate(size_t mem_size); + void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); DeviceType get_allocation_device_type() const; @@ -35,6 +39,8 @@ Allocator(std::shared_ptr<IAllocator> ptr) : i_allocator(ptr){}; + + AllocLocation alloc_location; + private: std::shared_ptr<IAllocator> i_allocator; }; diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h index cf6cfe35d1..c18d43683e 100644 --- a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -10,6 +10,7 @@ struct LocalCPUAllocator : public IAllocator { ~LocalCPUAllocator() = default; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index b8e0540974..fb3a42d864 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,6 +10,7 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index bed8daba51..255cf4b7e3 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -7,6 +7,10 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } +void *Allocator::allocate_and_zero(size_t mem_size) { + return this->i_allocator->allocate_and_zero(mem_size); +} + void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 5cf337c685..adc31b2c6b 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -23,8 +23,7 @@ DeviceType LocalCPUAllocator::get_allocation_device_type() const { } Allocator create_local_cpu_memory_allocator() { - Allocator allocator = Allocator::create<LocalCPUAllocator>(); - return allocator; + return Allocator::create<LocalCPUAllocator>(); } } // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index c72020acb2..666e5cae2e 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -4,6 +4,13 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + this->ptrs.insert(ptr); + return ptr; +} + +void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index bd0167a677..ea861c7da9 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -53,13 +53,16 @@ TEST_SUITE(FF_TEST_SUITE) { {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w<DataType::FLOAT>(query_shape, + allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w<DataType::FLOAT>(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w<DataType::FLOAT>(value_shape, + allocator); 
GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -79,15 +82,20 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, + allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index d78d5daee5..63e0909b9a 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -29,11 +29,14 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, allocator); + create_random_filled_accessor_w(input_shape_a, + allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, allocator); + create_random_filled_accessor_w(input_shape_b, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -52,7 +55,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index d0ec2559ba..79331a8539 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -39,7 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w( @@ -62,13 +63,17 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); 
+ create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, allocator); + create_random_filled_accessor_w(scale_shape, + allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, allocator); + create_random_filled_accessor_w(bias_shape, + allocator); Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 77d602a89d..af7f537189 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -83,9 +83,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -102,31 +102,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before GPU load" << std::endl; std::vector result_data_gpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), - false); + load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); - Kernels::Cast::CPU::forward_kernel( + input_shape, cpu_allocator, transform); + Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before CPU load" << std::endl; + if (output_accessor_cpu.on_device) { + std::cout << "CPU data is on device" << std::endl; + } else { + std::cout << "CPU data is on host" << std::endl; + } std::vector result_data_cpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), - true); + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index ad74fa7d36..4bcb37f083 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -47,9 +47,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index b75614588c..45005092fe 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -42,7 +42,8 @@ 
TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 8368fe4efd..cebf88986d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -56,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ff74f6fb28..74d178bd64 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -55,9 +55,11 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fc61458568..9cd59464b9 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -118,11 +118,11 @@ TEST_SUITE(FF_TEST_SUITE) { // reduced shape, but things are weird cause doesn't seem to be replicating // anything (ie. 
input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}); + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -133,30 +133,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { // Run GPU Replicate Forward Kernel GenericTensorAccessorR input_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, gpu_allocator)); + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(replicated_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories( - input_accessor_gpu, input_shape, cpu_allocator); + copy_tensor_between_memories(input_accessor_gpu, + cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(replicated_shape); - Kernels::Replicate::CPU::forward_kernel( + Kernels::Replicate::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -164,35 +164,33 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + create_random_filled_accessor_r(replicated_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(reduced_shape); + gpu_allocator.allocate_tensor_and_zero(reduced_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, output_grad_accessor_gpu, num_replicas); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( - output_grad_accessor_gpu, replicated_shape, cpu_allocator); + output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(reduced_shape); + cpu_allocator.allocate_tensor_and_zero(reduced_shape); - Kernels::Replicate::CPU::backward_kernel( + Kernels::Replicate::cpu_backward_kernel( input_grad_accessor_cpu, read_only_accessor_from_write_accessor(output_grad_accessor_cpu), num_replicas); - std::vector result_data_cpu = load_accessor_data( - 
read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index d1c5274dc8..503da33984 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -43,7 +43,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -143,9 +144,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t reverse_dim_size = 3; std::size_t in_blk_size = 5; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -162,7 +162,7 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Cast Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -174,17 +174,17 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); + input_shape, cpu_allocator, transform); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); - Kernels::Reverse::CPU::forward_kernel( + Kernels::Reverse::cpu_forward_kernel( input_accessor_cpu.get_float_ptr(), output_accessor_cpu.get_float_ptr(), num_out_blks, @@ -192,8 +192,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -201,7 +201,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_gpu = - create_random_filled_accessor_w(output_shape, gpu_allocator); + create_random_filled_accessor_w(output_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = gpu_allocator.allocate_tensor(input_shape); @@ -214,20 +215,18 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( read_only_accessor_from_write_accessor(output_grad_accessor_gpu), - output_shape, 
cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(input_shape); - Kernels::Reverse::CPU::backward_kernel( + Kernels::Reverse::cpu_backward_kernel( output_grad_accessor_cpu.get_float_ptr(), input_grad_accessor_cpu.get_float_ptr(), num_out_blks, @@ -235,9 +234,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index d4fb496f7b..7e6e95daaf 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -32,11 +32,13 @@ TEST_SUITE(FF_TEST_SUITE) { input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index d98f88a30e..4d3b948714 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -27,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -48,7 +49,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index cac43c6ff3..c35961b739 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -39,7 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index f697337c52..31ca2475e2 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -12,6 +12,7 @@ struct TrackedAllocator : public IAllocator { ~TrackedAllocator() = default; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git 
a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index ed181aea32..7bce6ef304 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,6 +12,12 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } +void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { + void *ptr = this->allocator.allocate_and_zero(requested_memory_size); + this->current_mem_usage += requested_memory_size; + return ptr; +} + void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); From f0432c393d972bf262fe7153686a10b10cd279e2 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:18:45 -0700 Subject: [PATCH 31/42] cpu_kernel's refactor, generic tensor accessor indexing --- lib/kernels/include/kernels/allocation.h | 6 --- .../include/kernels/batch_norm_kernels.h | 8 +-- .../include/kernels/local_cpu_allocator.h | 1 - .../include/kernels/local_cuda_allocator.h | 1 - lib/kernels/src/allocation.cc | 4 -- lib/kernels/src/local_cuda_allocator.cc | 8 --- lib/kernels/test/src/test_cast_kernel.cc | 24 +++------ lib/kernels/test/src/test_replicate_kernel.cc | 51 ++++++++---------- lib/kernels/test/src/test_reverse_kernels.cc | 53 +++++++++---------- .../local-execution/tracked_allocator.h | 1 - lib/local-execution/src/tracked_allocator.cc | 6 --- 11 files changed, 56 insertions(+), 107 deletions(-) diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 7a8b844cf4..4bf97118ce 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,13 +5,10 @@ #include #include -enum class AllocLocation { HOST, DEVICE }; - namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; - virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; virtual DeviceType get_allocation_device_type() const = 0; @@ -25,7 +22,6 @@ struct Allocator { GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); void *allocate(size_t mem_size); - void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); DeviceType get_allocation_device_type() const; @@ -39,8 +35,6 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; - AllocLocation alloc_location; - private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 4b89eb1411..26f347dd4c 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -7,8 +7,7 @@ #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels::BatchNorm { +namespace ::FlexFlow::Kernels::BatchNorm; BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -28,8 +27,6 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, float *output_grad_ptr, float const *input_ptr, @@ -47,7 +44,4 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace Kernels::BatchNorm -} // namespace FlexFlow - #endif diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h index c18d43683e..cf6cfe35d1 100644 --- 
a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -10,7 +10,6 @@ struct LocalCPUAllocator : public IAllocator { ~LocalCPUAllocator() = default; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index fb3a42d864..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,7 +10,6 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 255cf4b7e3..bed8daba51 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -7,10 +7,6 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } -void *Allocator::allocate_and_zero(size_t mem_size) { - return this->i_allocator->allocate_and_zero(mem_size); -} - void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index 666e5cae2e..416768a479 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -10,14 +10,6 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { return ptr; } -void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr; - checkCUDA(cudaMalloc(&ptr, requested_memory_size)); - checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); - this->ptrs.insert(ptr); - return ptr; -} - void LocalCudaAllocator::deallocate(void *ptr) { if (contains(this->ptrs, ptr)) { checkCUDA(cudaFree(ptr)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index af7f537189..1afa126870 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -94,44 +94,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Only calling forward kernel as backward kernel is exactly the same SUBCASE("forward_kernel") { - auto transform = [start_val = 1.1f, - counter = 0.0f](float input) mutable -> float { - return start_val + counter++; - }; - // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = - create_transformed_accessor_w( - input_shape, gpu_allocator, transform); + create_random_filled_accessor_w(input_shape, + gpu_allocator); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); - std::cout << "Before GPU load" << std::endl; + std::vector result_data_gpu = load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = - create_transformed_accessor_w( - input_shape, cpu_allocator, transform); + create_random_filled_accessor_w(input_shape, + cpu_allocator); Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); - std::cout << "Before CPU load" << std::endl; - if (output_accessor_cpu.on_device) { - std::cout << "CPU data is on 
device" << std::endl; - } else { - std::cout << "CPU data is on host" << std::endl; - } + std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 9cd59464b9..902a5a7427 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -112,17 +112,12 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { - std::size_t num_replicas = 10; + std::size_t num_replicas = 2; - // This should be like three shapes: pre_replication, replication shape, and - // reduced shape, but things are weird cause doesn't seem to be replicating - // anything (ie. input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); - TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); - TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -136,7 +131,8 @@ TEST_SUITE(FF_TEST_SUITE) { create_random_filled_accessor_r(input_shape, gpu_allocator); GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(replicated_shape); + gpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_gpu); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); @@ -145,29 +141,29 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories(input_accessor_gpu, - cpu_allocator); + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(replicated_shape); + cpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_cpu); - Kernels::Replicate::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu); + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(replicated_shape, + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor_and_zero(reduced_shape); + gpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_gpu); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, @@ -178,21 +174,20 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel - GenericTensorAccessorW output_grad_accessor_cpu = - copy_tensor_between_memories( - 
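
The comparisons above now go through vectors_are_approx_equal rather than operator==, which is the right call when CPU and GPU kernels can legitimately differ in the last few float ulps. A minimal sketch of such a helper, assuming a combined absolute/relative tolerance (the real helper's signature and epsilon may differ):

#include <cmath>
#include <cstddef>
#include <vector>

bool approx_equal_sketch(std::vector<float> const &a,
                         std::vector<float> const &b,
                         float eps = 1e-5f) {
  if (a.size() != b.size()) {
    return false;
  }
  for (std::size_t i = 0; i < a.size(); i++) {
    // absolute tolerance near zero, relative tolerance for large magnitudes
    float scale = 1.0f + std::fmax(std::fabs(a[i]), std::fabs(b[i]));
    if (std::fabs(a[i] - b[i]) > eps * scale) {
      return false;
    }
  }
  return true;
}
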
output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor_and_zero(reduced_shape); + cpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_cpu); Kernels::Replicate::cpu_backward_kernel( - input_grad_accessor_cpu, - read_only_accessor_from_write_accessor(output_grad_accessor_cpu), - num_replicas); + input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas); std::vector result_data_cpu = load_accessor_data(input_grad_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 503da33984..420a449cca 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -160,11 +160,12 @@ TEST_SUITE(FF_TEST_SUITE) { }; // Run GPU Cast Forward Kernel - GenericTensorAccessorW input_accessor_gpu = - create_transformed_accessor_w( - input_shape, gpu_allocator, transform); + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_gpu); Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), input_accessor_gpu.get_float_ptr(), @@ -178,33 +179,32 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(output_accessor_gpu); // Run CPU Cast Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - create_transformed_accessor_w( - input_shape, cpu_allocator, transform); + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_cpu); - Kernels::Reverse::cpu_forward_kernel( - input_accessor_cpu.get_float_ptr(), - output_accessor_cpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_cpu.shape.num_elements()); + Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu, + num_out_blks, + reverse_dim_size, + in_blk_size); std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel - GenericTensorAccessorW output_grad_accessor_gpu = - create_random_filled_accessor_w(output_shape, + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = gpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_gpu); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), @@ -219,25 +219,22 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(input_grad_accessor_gpu); // Run CPU Cast Backward Kernel - GenericTensorAccessorW output_grad_accessor_cpu = - copy_tensor_between_memories( - read_only_accessor_from_write_accessor(output_grad_accessor_gpu), - cpu_allocator); + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = 
cpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_cpu); - Kernels::Reverse::cpu_backward_kernel( - output_grad_accessor_cpu.get_float_ptr(), - input_grad_accessor_cpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_cpu.shape.num_elements()); + Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_out_blks, + reverse_dim_size, + in_blk_size); std::vector result_data_cpu = load_accessor_data(input_grad_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 31ca2475e2..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -12,7 +12,6 @@ struct TrackedAllocator : public IAllocator { ~TrackedAllocator() = default; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; DeviceType get_allocation_device_type() const override; diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 7bce6ef304..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,12 +12,6 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } -void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr = this->allocator.allocate_and_zero(requested_memory_size); - this->current_mem_usage += requested_memory_size; - return ptr; -} - void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); From 74d186d2e2648097c77b7d1bdba9a1983ddf1736 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:40:12 -0700 Subject: [PATCH 32/42] test_utils refactor and clarity --- lib/kernels/src/cuda/ops/reverse_kernels.cu | 13 ++- lib/kernels/test/src/test_attention_kernel.cc | 26 ++--- .../test/src/test_batch_matmul_kernel.cc | 12 +-- .../test/src/test_batch_norm_kernel.cc | 15 +-- lib/kernels/test/src/test_cast_kernel.cc | 49 --------- lib/kernels/test/src/test_dropout.cc | 6 +- lib/kernels/test/src/test_gather_kernels.cc | 3 +- .../test/src/test_layer_norm_kernels.cc | 3 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_replicate_kernel.cc | 80 -------------- lib/kernels/test/src/test_reverse_kernels.cc | 102 +----------------- lib/kernels/test/src/test_softmax_kernel.cc | 6 +- lib/kernels/test/src/test_split_kernel.cc | 6 +- lib/kernels/test/src/test_transpose_kernel.cc | 3 +- lib/kernels/test/src/test_utils.cc | 77 +++++++++++++ 15 files changed, 119 insertions(+), 288 deletions(-) diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index c750819266..6469dfc735 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -26,6 +26,7 @@ namespace Reverse { // coord_t reverse_dim_size, // coord_t in_blk_size) { // CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t out_idx = i; // coord_t blk_idx = i / (reverse_dim_size * in_blk_size); // i = i - blk_idx * (reverse_dim_size * in_blk_size); // coord_t reverse_dim_idx = i / in_blk_size; @@ -33,8 +34,18 @@ namespace Reverse 
{ // coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + // (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + // i; -// out_ptr[i] = in_ptr[in_idx]; +// out_ptr[out_idx] = in_ptr[in_idx]; // } +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = in_ptr[in_idx]; +// } // } /* I mentioned this earlier, but I still think the reverse_forward_kernel code diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index ea861c7da9..bd0167a677 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -53,16 +53,13 @@ TEST_SUITE(FF_TEST_SUITE) { {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -82,20 +79,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, - allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 63e0909b9a..d78d5daee5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -29,14 +29,11 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, - allocator); + create_random_filled_accessor_w(input_shape_a, allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, - allocator); + create_random_filled_accessor_w(input_shape_b, allocator); GenericTensorAccessorW output_accessor = - 
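
The index arithmetic in the commented-out reverse kernel above is easier to verify in scalar form. A self-contained CPU sketch of the same computation -- reversing the middle of three logical dimensions, under the same (num_out_blks, reverse_dim_size, in_blk_size) decomposition:

#include <cstddef>
#include <vector>

std::vector<float> reverse_middle_dim(std::vector<float> const &in,
                                      std::size_t num_out_blks,
                                      std::size_t reverse_dim_size,
                                      std::size_t in_blk_size) {
  std::vector<float> out(in.size());
  for (std::size_t blk = 0; blk < num_out_blks; blk++) {
    for (std::size_t rev = 0; rev < reverse_dim_size; rev++) {
      for (std::size_t i = 0; i < in_blk_size; i++) {
        std::size_t out_idx =
            (blk * reverse_dim_size + rev) * in_blk_size + i;
        // read from the mirrored position along the reversed dimension
        std::size_t in_idx =
            (blk * reverse_dim_size + (reverse_dim_size - 1 - rev)) *
                in_blk_size + i;
        out[out_idx] = in[in_idx];
      }
    }
  }
  return out;
}
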
create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -55,8 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 79331a8539..d0ec2559ba 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -39,8 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w( @@ -63,17 +62,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, - allocator); + create_random_filled_accessor_w(scale_shape, allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, - allocator); + create_random_filled_accessor_w(bias_shape, allocator); Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 1afa126870..2ac27a9747 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -75,53 +75,4 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } - - TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); - - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - - // Only calling forward kernel as backward kernel is exactly the same - SUBCASE("forward_kernel") { - // Run GPU Forward Kernel - GenericTensorAccessorW input_accessor_gpu = - create_random_filled_accessor_w(input_shape, - gpu_allocator); - Kernels::Cast::forward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(input_accessor_gpu), - output_accessor_gpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run 
CPU Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - create_random_filled_accessor_w(input_shape, - cpu_allocator); - Kernels::Cast::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 4bcb37f083..ad74fa7d36 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -47,11 +47,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 45005092fe..b75614588c 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -42,8 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index cebf88986d..8368fe4efd 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -56,8 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 74d178bd64..ff74f6fb28 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -55,11 +55,9 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 902a5a7427..357d1958c0 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -110,84 +110,4 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } - - 
TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { - std::size_t num_replicas = 2; - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); - - ManagedPerDeviceFFHandle managed_handle{}; - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - // Run GPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_gpu); - - Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); - - Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); - - Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor_gpu, - output_grad_accessor_gpu, - num_replicas); - - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - - // Run CPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); - - Kernels::Replicate::cpu_backward_kernel( - input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 420a449cca..bf23188a8f 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -43,8 +43,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -138,103 +137,4 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu)); } } - - TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 2; - std::size_t reverse_dim_size = 3; - std::size_t in_blk_size = 5; - - TensorShape input_shape = 
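
The replicate backward kernels exercised in the test above reduce the replicated output gradient back to the input-gradient shape; per element it is just a sum over the replicas. A scalar sketch (replica-major layout is assumed here purely for illustration; the real accessors index by coordinate):

#include <cstddef>
#include <vector>

// input_grad has n elements; output_grad has n * num_replicas elements.
void replicate_backward_sketch(std::vector<float> &input_grad,
                               std::vector<float> const &output_grad,
                               std::size_t n,
                               std::size_t num_replicas) {
  for (std::size_t i = 0; i < n; i++) {
    float sum = 0.0f;
    for (std::size_t r = 0; r < num_replicas; r++) {
      sum += output_grad[r * n + i]; // accumulate this element's replicas
    }
    input_grad[i] = sum;
  }
}
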
make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); - TensorShape output_shape = input_shape; - - ManagedPerDeviceFFHandle managed_handle{}; - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - auto transform = [counter = 0.0f](float val) mutable { - return counter++; - }; - - // Run GPU Cast Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_gpu); - - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu.get_float_ptr(), - output_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_gpu.shape.num_elements()); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - - // Run CPU Cast Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); - - Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Cast Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); - - Kernels::Reverse::backward_kernel( - managed_stream.raw_stream(), - output_grad_accessor_gpu.get_float_ptr(), - input_grad_accessor_gpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_gpu.shape.num_elements()); - - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - - // Run CPU Cast Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); - - Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, - input_grad_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); - } - } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 7e6e95daaf..d4fb496f7b 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -32,13 +32,11 @@ TEST_SUITE(FF_TEST_SUITE) { input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + 
create_random_filled_accessor_w(input_shape, allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 4d3b948714..d98f88a30e 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -27,8 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -49,8 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index c35961b739..cac43c6ff3 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -39,8 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 70cca5f2f0..015918b8a5 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -110,6 +110,83 @@ struct CPUAccessorRContainsNonZero { } }; +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +bool contains_non_zero(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + return contains_non_zero(r_accessor); +} + +GenericTensorAccessorR + create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} + +GenericTensorAccessorW + create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); + } + return cpu_accessor; +} + +template +struct PrintCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>;
+
+    T const *data_ptr = accessor.get<DT>
(); + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + std::cout << data_ptr[i] << " "; + } + std::cout << "\n"; + } +}; + +void print_accessor(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +void print_accessor(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + print_accessor(r_accessor); +} + +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>;
+
+    T const *data_ptr = accessor.get<DT>
(); + + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = From f95d9da981b8c5b222c5ca983b9ae0a687b32b68 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 4 Nov 2024 23:12:02 -0800 Subject: [PATCH 33/42] R & W accessor changes, minimize code bloat --- .../test/src/test_managed_ff_stream.cc | 5 ++ .../src/test_managed_per_device_ff_handle.cc | 9 +-- lib/kernels/test/src/test_utils.cc | 77 ------------------- 3 files changed, 9 insertions(+), 82 deletions(-) diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 605aa6ffa1..3535dd258c 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -27,5 +27,10 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(&base_stream.raw_stream() == base_stream_ptr); } } + + SUBCASE("Test Self-Assignment") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d081a0b07c..b22c683205 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -28,14 +28,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*allowTensorOpMathConversion=*/true}; new_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == nullptr); - CHECK(&new_handle.raw_handle() == base_handle_ptr); - } + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } - SUBCASE("move assign to self") { + SUBCASE("move assign to self") { base_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() == base_handle_ptr); - } } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 015918b8a5..70cca5f2f0 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -110,83 +110,6 @@ struct CPUAccessorRContainsNonZero { } }; -bool contains_non_zero(GenericTensorAccessorR const &accessor) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - return DataTypeDispatch1{}( - cpu_accessor.data_type, cpu_accessor); -} - -bool contains_non_zero(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - return contains_non_zero(r_accessor); -} - -GenericTensorAccessorR - create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorR cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); - } - return cpu_accessor; -} - -GenericTensorAccessorW - create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorW cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); - } - return cpu_accessor; -} - -template -struct PrintCPUAccessorR { - void operator()(GenericTensorAccessorR const &accessor) { - using T = 
real_type_t<DT>
;
-
-    T const *data_ptr = accessor.get<DT>
(); - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - std::cout << data_ptr[i] << " "; - } - std::cout << "\n"; - } -}; - -void print_accessor(GenericTensorAccessorR const &accessor) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - DataTypeDispatch1{}(accessor.data_type, accessor); -} - -void print_accessor(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - print_accessor(r_accessor); -} - -template -struct CPUAccessorRContainsNonZero { - bool operator()(GenericTensorAccessorR const &accessor) { - using T = real_type_t
<DT>;
-
-    T const *data_ptr = accessor.get<DT>
(); - - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - if (data_ptr[i] != 0) { - return true; - } - } - - return false; - } -}; - bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = From 8c8bc75c03412614cce37e73e3808d0859bbb178 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 21 Nov 2024 22:16:51 -0800 Subject: [PATCH 34/42] issue #1502 & issue #1540 --- .../include/kernels/loss_function_kernels.h | 2 +- lib/kernels/include/kernels/pool_2d_kernels.h | 2 +- lib/pcg/include/pcg/metric.h | 73 +++++++++++++++++++ 3 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 lib/pcg/include/pcg/metric.h diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..9e0dbd4ba1 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 9650859a18..ad0a52efb9 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,7 +67,7 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, void const *output_ptr, void const *output_grad_ptr, diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h new file mode 100644 index 0000000000..f56078772e --- /dev/null +++ b/lib/pcg/include/pcg/metric.h @@ -0,0 +1,73 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include +#include "utils/fmt.h" +#include "op-attrs/ops/loss_functions/loss_functions.h" + +namespace FlexFlow { + +enum class Metric { + ACCURACY, + CATEGORICAL_CROSSENTROPY, + SPARSE_CATEGORICAL_CROSSENTROPY, + MEAN_SQUARED_ERROR, + ROOT_MEAN_SQUARED_ERROR, + MEAN_ABSOLUTE_ERROR, +}; + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::vector const &); + +public: + LossFunction loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +namespace fmt { + +template <> +struct formatter<::FlexFlow::Metric> : formatter { + template + auto format(::FlexFlow::Metric m, FormatContext &ctx) const + -> decltype(ctx.out()) { + using namespace FlexFlow; + + string_view name = "unknown"; + switch (m) { + case Metric::ACCURACY: + name = "Accuracy"; + break; + case Metric::CATEGORICAL_CROSSENTROPY: + name = "CategoricalCrossEntropy"; + break; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + name = "SparseCategoricalCrossEntropy"; + break; + case Metric::MEAN_SQUARED_ERROR: + name = "MeanSquaredError"; + break; + case Metric::ROOT_MEAN_SQUARED_ERROR: + name = "RootMeanSquaredError"; + break; + case Metric::MEAN_ABSOLUTE_ERROR: + name = "MeanAbsoluteError"; + break; + } + return formatter::format(name, ctx); + } +}; + +} // namespace fmt + + +#endif From c00ab840d1ae9a091368ea1440d5133b91bf0ea1 Mon Sep 
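
With the fmt::formatter specialization above in place, Metric values can be handed straight to fmt. A minimal usage sketch, assuming the header is reachable as pcg/metric.h per the diffstat:

#include "pcg/metric.h" // header added by the patch above
#include <fmt/format.h>

int main() {
  FlexFlow::Metric m = FlexFlow::Metric::MEAN_SQUARED_ERROR;
  fmt::print("metric = {}\n", m); // prints "metric = MeanSquaredError"
  return 0;
}
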
17 00:00:00 2001 From: Dylan Lim Date: Mon, 27 Jan 2025 20:57:10 -0800 Subject: [PATCH 35/42] branch merge and test fixes --- lib/kernels/include/kernels/loss_function_kernels.h | 2 +- lib/kernels/include/kernels/pool_2d_kernels.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index 9e0dbd4ba1..bab404f884 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index ad0a52efb9..9650859a18 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,7 +67,7 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(cudaStream_t stream, +void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, void const *output_ptr, void const *output_grad_ptr, From bc4b6592346306f27665c3dc7c31c306b5b14825 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Wed, 5 Feb 2025 01:30:08 -0800 Subject: [PATCH 36/42] merge --- lib/pcg/include/pcg/metric.h | 73 ------------------- .../src/pcg/{metric.cc => metric_attrs.cc} | 0 2 files changed, 73 deletions(-) delete mode 100644 lib/pcg/include/pcg/metric.h rename lib/pcg/src/pcg/{metric.cc => metric_attrs.cc} (100%) diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h deleted file mode 100644 index f56078772e..0000000000 --- a/lib/pcg/include/pcg/metric.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef _FF_METRICS_H_ -#define _FF_METRICS_H_ - -#include -#include "utils/fmt.h" -#include "op-attrs/ops/loss_functions/loss_functions.h" - -namespace FlexFlow { - -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - -} // namespace FlexFlow - -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - - -#endif diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric_attrs.cc similarity index 100% rename from 
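
Patches 34 and 35 together flip Pool2D's backward_kernel between cudaStream_t and ffStream_t; ffStream_t is the backend-portable alias, so the revert here is the right direction. A sketch of how such an alias is commonly defined (illustrative only -- the real definition lives in kernels/device.h, and the guard macro names here are assumptions):

#if defined(FF_USE_CUDA)
#include <cuda_runtime.h>
typedef cudaStream_t ffStream_t;
#elif defined(FF_USE_HIP_ROCM)
#include <hip/hip_runtime.h>
typedef hipStream_t ffStream_t;
#endif
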
lib/pcg/src/pcg/metric.cc rename to lib/pcg/src/pcg/metric_attrs.cc From e71b6d749cadff517a7b5e35cf69581d99474125 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Wed, 5 Feb 2025 05:19:37 -0800 Subject: [PATCH 37/42] build after merge --- lib/kernels/include/kernels/accessor.h | 7 +- .../include/kernels/batch_norm_kernels.h | 3 +- lib/kernels/src/accessor.cc | 5 +- lib/kernels/src/cpu/cast_kernels.cc | 4 +- lib/kernels/src/cpu/combine_kernels.cc | 5 +- lib/kernels/src/cpu/replicate_kernels.cc | 5 +- lib/kernels/src/cpu/reverse_kernels.cc | 7 +- lib/kernels/src/cuda/embedding_kernels.cu | 214 +++++++++--------- lib/kernels/src/cuda/optimizer_kernels.cu | 6 +- lib/kernels/test/src/test_cast_kernel.cc | 4 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 4 - .../src/test_managed_per_device_ff_handle.cc | 9 +- lib/kernels/test/src/test_reduction_kernel.cc | 1 - lib/kernels/test/src/test_replicate_kernel.cc | 12 +- lib/kernels/test/src/test_reverse_kernels.cc | 15 +- lib/kernels/test/src/test_utils.cc | 26 ++- lib/kernels/test/src/test_utils.h | 9 +- 18 files changed, 178 insertions(+), 160 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index a6fc4129e0..52ca62e217 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -17,17 +17,18 @@ inline int calculate_accessor_offset(std::vector const &indices, int multiplier = 1; for (int i = 0; i < shape.num_dims(); i++) { - if (indices.at(i) >= shape.at(legion_dim_t{i})) { + if (indices.at(i) >= shape.at(legion_dim_t{nonnegative_int{i}})) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, indices.at(i), - shape.at(legion_dim_t{i}))); + shape.at(legion_dim_t{nonnegative_int{i}}))); } offset += indices.at(i) * multiplier; - multiplier *= shape.at(legion_dim_t{i}); + multiplier *= + shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative(); } return offset; diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 26f347dd4c..90202592a7 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -7,7 +7,7 @@ #include "kernels/ff_handle.h" #include -namespace ::FlexFlow::Kernels::BatchNorm; +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -44,4 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index e56bded737..1a0abec1c5 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -7,8 +7,9 @@ namespace FlexFlow { void copy_accessor_data_to_l_from_r( GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor) { - size_t num_bytes = dst_accessor.shape.get_volume() * - size_of_datatype(dst_accessor.data_type); + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); DeviceType dst_device_type = dst_accessor.device_type; DeviceType src_device_type = src_accessor.device_type; diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 08f5552afc..cdd57b8947 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ 
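
The calculate_accessor_offset change above keeps the usual strided-offset computation, with legion dimension 0 varying fastest. A scalar sketch with a worked example (illustrative; the real function also bounds-checks each index, as the hunk shows):

#include <cassert>
#include <cstddef>
#include <vector>

int offset_sketch(std::vector<int> const &indices,
                  std::vector<int> const &dims) {
  int offset = 0;
  int multiplier = 1;
  for (std::size_t i = 0; i < dims.size(); i++) {
    offset += indices[i] * multiplier; // dim 0 varies fastest
    multiplier *= dims[i];
  }
  return offset;
}

int main() {
  // index {1, 2, 1} in a {4, 3, 2} tensor: 1*1 + 2*4 + 1*12 == 21
  assert(offset_sketch({1, 2, 1}, {4, 3, 2}) == 21);
  return 0;
}
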
b/lib/kernels/src/cpu/cast_kernels.cc @@ -21,7 +21,7 @@ template struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + size_t volume = input.shape.get_volume().unwrap_nonnegative(); cpu_cast_forward(input.get(), output.get(), volume); } }; @@ -30,7 +30,7 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.get_volume(); + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cpu_cast_backward( output.get(), input.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index d0be1f9f2d..577984f21a 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -9,7 +9,8 @@ struct CPUForwardKernel { GenericTensorAccessorW const &output) { memcpy(output.get
<DT>(),
           input.get<DT>
(), - input.shape.get_volume() * size_of_datatype(DT)); + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); } }; @@ -17,7 +18,7 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.get_volume(); + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); for (int i = 0; i < num_elements; ++i) { input_grad.get
<DT>()[i] += output_grad.get<DT>
()[i]; } diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index cfcb44dac5..1e50cad4b4 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -9,7 +9,8 @@ struct CPUForwardKernel { GenericTensorAccessorW &output) { memcpy(output.get
<DT>(),
           input.get<DT>
(), - input.shape.num_elements() * size_of_datatype(DT)); + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); } }; @@ -19,7 +20,7 @@ struct CPUBackwardKernel { GenericTensorAccessorW &input, size_t num_replicas) { using T = real_type_t
<DT>;
-    for (int i = 0; i < input.shape.num_elements(); i++) {
+    for (int i = 0; i < input.shape.num_elements().unwrap_nonnegative(); i++) {
       T cur_sum = 0;
       for (int j = 0; j < num_replicas; j++) {
         cur_sum += output.at<DT>
({i, j}); diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index bc73c80e9e..848094cda7 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,9 +11,10 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - int num_out_blocks = input.shape.at(legion_dim_t(0)); - int reverse_dim_size = input.shape.at(legion_dim_t(1)); - int in_block_size = input.shape.at(legion_dim_t(2)); + int num_out_blocks = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); + int reverse_dim_size = + input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); + int in_block_size = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) { for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index c83e9f0a94..7ea1d3b9d3 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -342,26 +342,28 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - in_dim, - batch_size, - aggr.value()); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -377,26 +379,28 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - in_dim, - batch_size, - aggr.value()); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -412,18 +416,19 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -448,26 +453,28 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - in_dim, - batch_size, - aggr.value()); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -483,26 +490,28 @@ struct ForwardKernel { int out_dim, int batch_size) { if 
(!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - in_dim, - batch_size, - aggr.value()); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -518,18 +527,19 @@ struct ForwardKernel { int out_dim, int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr<<>>(input.get(), - output.get(), - weight.get(), - out_dim, - batch_size); + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -570,7 +580,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -580,7 +590,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -606,7 +616,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -616,7 +626,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -642,7 +652,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -652,7 +662,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -678,7 +688,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -688,7 +698,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -714,7 +724,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -724,7 +734,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -750,7 +760,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -760,7 +770,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 1c6954a0b0..8aab79ba65 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -89,8 +89,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, using T = std::decay_t; if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v || - std::is_same_v) { + std::is_same_v) { throw mk_runtime_error("State type does not support NCCL operations"); } else { return s.handle.ncclComm; @@ -209,8 +208,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, using T = std::decay_t; if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v || - std::is_same_v) { + std::is_same_v) { throw mk_runtime_error("State type does not support NCCL operations"); } else { return s.handle.ncclComm; diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 2ac27a9747..c59d8eae3f 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -48,9 
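
The embedding hunks above only reflow the kernel launches, but the semantics they wrap are worth spelling out: with no aggregation each token id selects one row of the weight matrix, while AggregateOp::SUM / AVG reduce a sample's in_dim ids into a single output row. A scalar sketch of the aggregated case (row-major layouts assumed for illustration; this is not the CUDA kernel itself):

#include <vector>

// ids:    batch x in_dim token indices
// weight: vocab x out_dim embedding table
// output: batch x out_dim aggregated embeddings
void embed_forward_aggr_sketch(std::vector<int> const &ids,
                               std::vector<float> const &weight,
                               std::vector<float> &output,
                               int in_dim, int out_dim, int batch,
                               bool average) {
  for (int b = 0; b < batch; b++) {
    for (int d = 0; d < out_dim; d++) {
      float acc = 0.0f;
      for (int j = 0; j < in_dim; j++) {
        acc += weight[ids[b * in_dim + j] * out_dim + d];
      }
      output[b * out_dim + d] = average ? acc / in_dim : acc;
    }
  }
}
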
+48,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 2}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10_n, 2_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({10, 2}, DataType::DOUBLE); + make_tensor_shape_from_legion_dims({10_n, 2_n}, DataType::DOUBLE); // Only calling forward kernel as backward kernel is exactly the same SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 91f42669eb..97fa81920b 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({5, 5}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5_n, 5_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index ad74fa7d36..1a34c59be6 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -27,10 +27,6 @@ TEST_SUITE(FF_TEST_SUITE) { DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index b22c683205..d081a0b07c 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -28,13 +28,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*allowTensorOpMathConversion=*/true}; new_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == nullptr); - CHECK(&new_handle.raw_handle() == base_handle_ptr); - } + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } - SUBCASE("move assign to self") { + SUBCASE("move assign to self") { base_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 5078edee57..f91c4959cc 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -20,7 +20,6 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { TensorShape output_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 357d1958c0..677f1f8f5e 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), output_grad_accessor, input_grad_accessor, - num_replicas); + num_replicas.unwrap_nonnegative()); CHECK(contains_non_zero(input_grad_accessor)); } @@ -52,8 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = 
make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({5_n, num_replicas}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {5_n, num_replicas}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -95,7 +95,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), output_grad_accessor_gpu, input_grad_accessor_gpu, - num_replicas); + num_replicas.unwrap_nonnegative()); // Run CPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = @@ -104,7 +104,9 @@ TEST_SUITE(FF_TEST_SUITE) { create_zero_filled_accessor_w(input_shape, cpu_allocator); Kernels::Replicate::cpu_backward_kernel( - output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); + output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_replicas.unwrap_nonnegative()); CHECK(accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu)); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index bf23188a8f..4628cbd371 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -88,13 +88,14 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor_gpu = create_zero_filled_accessor_w(output_shape, gpu_allocator); - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu.get_float_ptr(), - output_accessor_gpu.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_accessor_gpu.shape.num_elements().unwrap_nonnegative()); + Kernels::Reverse::forward_kernel( + managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_accessor_gpu.shape.num_elements().unwrap_nonnegative()); // Run CPU Cast Forward Kernel GenericTensorAccessorR input_accessor_cpu = diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 70cca5f2f0..117c13a035 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -3,7 +3,7 @@ #include "utils/join_strings.h" #include -using namespace ::FlexFlow; +namespace FlexFlow { GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, Allocator &allocator) { @@ -12,12 +12,11 @@ GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, return result_accessor; } -TensorShape - make_tensor_shape_from_legion_dims(LegionOrdered const &dims, - DataType DT) { +TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, + DataType DT) { return TensorShape{ TensorDims{ - ff_ordered_from_legion_ordered(dims), + dims, }, DT, }; @@ -35,7 +34,7 @@ struct CreateRandomFilledAccessorW { std::random_device rd; std::mt19937 gen(rd()); - size_t num_elements = get_num_elements(shape); + size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); if constexpr (std::is_same::value) { std::bernoulli_distribution dist(0.5); for (size_t i = 0; i < num_elements; i++) { @@ -80,10 +79,14 @@ struct FillWithZeros { using T = real_type_t
; if (accessor.device_type == DeviceType::CPU) { - memset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T)); + memset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); } else { - checkCUDA( - cudaMemset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T))); + checkCUDA(cudaMemset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * + sizeof(T))); } } }; @@ -142,8 +145,8 @@ template struct Print2DCPUAccessorR { void operator()(GenericTensorAccessorR const &accessor, std::ostream &stream) { - int rows = accessor.shape.at(legion_dim_t{0}); - int cols = accessor.shape.at(legion_dim_t{1}); + int rows = accessor.shape.at(legion_dim_t{0_n}); + int cols = accessor.shape.at(legion_dim_t{1_n}); std::vector indices(cols); std::iota(indices.begin(), indices.end(), 0); @@ -246,3 +249,4 @@ GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, create_filled_accessor_w(shape, allocator, val); return read_only_accessor_from_write_accessor(w_accessor); } +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index a41bfc3aff..1d60562322 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -15,7 +15,7 @@ #include #include -using namespace ::FlexFlow; +namespace FlexFlow { GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator); @@ -26,9 +26,8 @@ GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, Allocator &allocator); -TensorShape - make_tensor_shape_from_legion_dims(LegionOrdered const &dims, - DataType DT); +TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, + DataType DT); bool contains_non_zero(GenericTensorAccessorR const &accessor); @@ -65,6 +64,8 @@ std::vector repeat(std::size_t n, Func &&func) { return result; } +} // namespace FlexFlow + // Specialize doctest's StringMaker for std::vector template <> struct doctest::StringMaker> { From 311caf88033e6bfcf954f249b29a3946dd801668 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sat, 8 Feb 2025 06:33:26 -0800 Subject: [PATCH 38/42] kernel issues --- lib/kernels/include/kernels/array_shape.h | 2 +- .../include/kernels/reverse_kernels_cpu.h | 11 +- lib/kernels/src/array_shape.cc | 26 +++- lib/kernels/src/cpu/replicate_kernels.cc | 6 +- lib/kernels/src/cpu/reverse_kernels.cc | 49 ++++---- lib/kernels/src/cuda/ops/concat_kernels.cu | 82 +++++++------ lib/kernels/src/cuda/ops/gather_kernels.cu | 19 +-- lib/kernels/src/cuda/ops/replicate_kernels.cu | 7 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 48 ++------ lib/kernels/src/legion_dim.cc | 4 +- lib/kernels/test/src/test_concat_kernel.cc | 113 +++++++++++++----- lib/kernels/test/src/test_gather_kernels.cc | 99 ++++++++++----- lib/kernels/test/src/test_replicate_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 29 +++-- lib/kernels/test/src/test_utils.cc | 41 ++++--- lib/kernels/test/src/test_utils.h | 6 +- .../utils/nonnegative_int/nonnegative_int.h | 3 + .../utils/nonnegative_int/nonnegative_int.cc | 9 ++ 18 files changed, 343 insertions(+), 213 deletions(-) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..72c746b8cc 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -15,7 +15,7 @@ namespace FlexFlow { struct ArrayShape { public: 
ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); + ArrayShape(nonnegative_int const *dims, nonnegative_int num_dims); ArrayShape(TensorShape const &shape); ArrayShape(std::vector const &); diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index 35af06aafb..e482557f93 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -7,10 +7,17 @@ namespace FlexFlow::Kernels::Reverse { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor); + GenericTensorAccessorW &output_accessor, + int num_out_blks, + int reverse_dim_size, + int in_blk_size); void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, - GenericTensorAccessorW &input_accessor); + GenericTensorAccessorW &input_accessor, + int num_out_blks, + int reverse_dim_size, + int in_blk_size); + } // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 54534f2ccf..30db65cc03 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -11,7 +11,7 @@ static LegionOrdered return LegionOrdered{reversed(vector_of(ff_ordered))}; } -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) +ArrayShape::ArrayShape(nonnegative_int const *_dims, nonnegative_int num_dims) : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} ArrayShape::ArrayShape(TensorShape const &shape) @@ -63,7 +63,29 @@ ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { - NOT_IMPLEMENTED(); + nonnegative_int num_dims = this->num_dims(); + + auto to_legion_index = [num_dims](auto arg) -> nonnegative_int { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return legion_dim_from_ff_dim(arg, num_dims).value; + } else { + return arg.value; + } + }; + + nonnegative_int start_idx = + (start.has_value()) ? std::visit(to_legion_index, start.value()) : 0_n; + + nonnegative_int end_idx = + (end.has_value()) ? std::visit(to_legion_index, end.value()) : num_dims; + + if (start_idx > num_dims || end_idx > num_dims || start_idx > end_idx) { + throw mk_runtime_error(fmt::format( + "Invalid sub_shape range: start={}, end={}", start_idx, end_idx)); + } + + return ArrayShape(&this->dims[legion_dim_t{start_idx}], end_idx - start_idx); } std::optional ArrayShape::at_maybe(legion_dim_t index) const { diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 1e50cad4b4..cdb030d2ff 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -18,9 +18,10 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, + size_t num_elements, size_t num_replicas) { using T = real_type_t
<DT>;
-    for (int i = 0; i < input.shape.num_elements().unwrap_nonnegative(); i++) {
+    for (int i = 0; i < num_elements; i++) {
       T cur_sum = 0;
       for (int j = 0; j < num_replicas; j++) {
         cur_sum += output.at<DT>({i, j});
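(For reference, the reduction this hunk factors num_elements out of can be
sketched in plain C++ on raw buffers. The accessor types are elided, and the
replica-major layout below is an assumption standing in for the
output.at<DT>({i, j}) indexing above, not the library's actual layout.)

    // Hedged sketch: the backward pass of replicate sums the num_replicas
    // copies of each gradient element back into the single input gradient.
    // Assumed layout (illustrative only): replica j of element i lives at
    // output[j * num_elements + i].
    template <typename T>
    void replicate_backward_cpu_sketch(T const *output, T *input,
                                       size_t num_elements,
                                       size_t num_replicas) {
      for (size_t i = 0; i < num_elements; i++) {
        T cur_sum = 0;
        for (size_t j = 0; j < num_replicas; j++) {
          cur_sum += output[j * num_elements + i];
        }
        input[i] = cur_sum;
      }
    }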
@@ -38,8 +39,9 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input,
 void cpu_backward_kernel(GenericTensorAccessorR const &output,
                          GenericTensorAccessorW &input,
                          size_t num_replicas) {
+  size_t num_elements = input.shape.num_elements().unwrap_nonnegative();
   DataTypeDispatch1<CPUBackwardKernel>{}(
-      input.data_type, output, input, num_replicas);
+      input.data_type, output, input, num_elements, num_replicas);
 }

 } // namespace FlexFlow::Kernels::Replicate
diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc
index 848094cda7..e259d059ff 100644
--- a/lib/kernels/src/cpu/reverse_kernels.cc
+++ b/lib/kernels/src/cpu/reverse_kernels.cc
@@ -1,6 +1,5 @@
 #include "kernels/datatype_dispatch.h"
 #include "kernels/reverse_kernels_cpu.h"
-#include
 #include

 namespace FlexFlow::Kernels::Reverse {
@@ -8,21 +7,15 @@ template <DataType DT>
 struct CPUReverseForwardKernel {
   void operator()(GenericTensorAccessorR const &input,
-                  GenericTensorAccessorW &output) {
-    assert(input.data_type == DT && output.data_type == DT);
-
-    int num_out_blocks = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative();
-    int reverse_dim_size =
-        input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative();
-    int in_block_size = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative();
-
-    for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) {
+                  GenericTensorAccessorW &output,
+                  int num_out_blks,
+                  int reverse_dim_size,
+                  int in_blk_size) {
+    for (int blk_idx = 0; blk_idx < num_out_blks; blk_idx++) {
       for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) {
-        for (int i = 0; i < in_block_size; i++) {
-          output.at<DT>({block_idx, rev_idx, i}) =
-              input.at<DT>({num_out_blocks - 1 - block_idx,
-                            reverse_dim_size - 1 - rev_idx,
-                            in_block_size - 1 - i});
+        for (int inner_idx = 0; inner_idx < in_blk_size; inner_idx++) {
+          output.at<DT>({inner_idx, rev_idx, blk_idx}) = input.at<DT>(
+              {inner_idx, reverse_dim_size - 1 - rev_idx, blk_idx});
         }
       }
     }
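(A standalone restatement of the new indexing may help here: only the middle
"reverse" dimension is flipped, while the block and inner coordinates pass
through unchanged. The flat row-major layout below is an assumption used
purely for illustration; the kernel above goes through accessor coordinates
instead.)

    // Hedged sketch of the rewritten CPU reverse kernel, on a flat buffer of
    // num_out_blks x reverse_dim_size x in_blk_size elements.
    void reverse_forward_cpu_sketch(float const *in, float *out,
                                    int num_out_blks, int reverse_dim_size,
                                    int in_blk_size) {
      for (int blk = 0; blk < num_out_blks; blk++) {
        for (int rev = 0; rev < reverse_dim_size; rev++) {
          for (int inner = 0; inner < in_blk_size; inner++) {
            int dst = (blk * reverse_dim_size + rev) * in_blk_size + inner;
            int src = (blk * reverse_dim_size +
                       (reverse_dim_size - 1 - rev)) * in_blk_size + inner;
            out[dst] = in[src];
          }
        }
      }
    }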
@@ -30,15 +23,29 @@ struct CPUReverseForwardKernel {
 };

 void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor,
-                        GenericTensorAccessorW &output_accessor) {
-  DataTypeDispatch1<CPUReverseForwardKernel>{}(
-      input_accessor.data_type, input_accessor, output_accessor);
+                        GenericTensorAccessorW &output_accessor,
+                        int num_out_blks,
+                        int reverse_dim_size,
+                        int in_blk_size) {
+  DataTypeDispatch1<CPUReverseForwardKernel>{}(input_accessor.data_type,
+                                               input_accessor,
+                                               output_accessor,
+                                               num_out_blks,
+                                               reverse_dim_size,
+                                               in_blk_size);
 }

 void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor,
-                         GenericTensorAccessorW &input_accessor) {
-  DataTypeDispatch1<CPUReverseForwardKernel>{}(
-      output_accessor.data_type, output_accessor, input_accessor);
+                         GenericTensorAccessorW &input_accessor,
+                         int num_out_blks,
+                         int reverse_dim_size,
+                         int in_blk_size) {
+  DataTypeDispatch1<CPUReverseForwardKernel>{}(output_accessor.data_type,
+                                               output_accessor,
+                                               input_accessor,
+                                               num_out_blks,
+                                               reverse_dim_size,
+                                               in_blk_size);
 }

 } // namespace FlexFlow::Kernels::Reverse
diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
index aa442f5c3d..683dbbaac5 100644
--- a/lib/kernels/src/cuda/ops/concat_kernels.cu
+++ b/lib/kernels/src/cuda/ops/concat_kernels.cu
@@ -23,38 +23,48 @@ void calc_blk_size(size_t &num_blocks,
                    size_t &blk_size,
                    ArrayShape const &shape,
                    ff_dim_t axis) {
-  blk_size = shape.sub_shape(legion_dim_t{0_n}, axis)
+  legion_dim_t legion_axis = (legion_dim_from_ff_dim(axis, shape.num_dims()));
+  assert(legion_axis.value < shape.num_dims());
+  if (legion_axis.value == 0_n) {
+    legion_axis.value = 1_n;
+  }
+  blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis)
                  .num_elements()
                  .unwrap_nonnegative();
-  num_blocks =
-      shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative();
+  num_blocks = shape.sub_shape(legion_axis, std::nullopt)
                   .num_elements()
                   .unwrap_nonnegative();
 }

 void forward_kernel(cudaStream_t stream,
                     GenericTensorAccessorW const &output,
                     std::vector<GenericTensorAccessorR> const &inputs,
                     ff_dim_t axis) {
-  size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
-  int num_inputs = inputs.size();
-  assert(num_inputs <= MAX_NUM_INPUTS);
+  assert(inputs.size() <= MAX_NUM_INPUTS);
+  size_t num_blocks = 1, output_blk_size = 1;
   calc_blk_size(num_blocks, output_blk_size, output.shape, axis);
-  for (int i = 0; i < num_inputs; i++) {
-    size_t input_num_blocks = 1;
-    calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis);
-    assert(input_num_blocks == num_blocks);
-  }
-
   off_t offset = 0;
-  for (int i = 0; i < num_inputs; i++) {
-    copy_with_stride<<>>(output.get_float_ptr() + offset,
-                                    inputs[i].get_float_ptr(),
-                                    num_blocks,
+                                    input.get_float_ptr(),
+                                    blocks_to_copy,
                                     output_blk_size,
-                                    input_blk_sizes[i]);
-    offset += input_blk_sizes[i];
+                                    input_blk_size);
+
+    offset += (output_blk_size == input_blk_size)
                   ?
input_blk_size * input_num_blocks + : input_blk_size; } } @@ -62,29 +72,31 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..99034089b5 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -18,9 +18,7 @@ #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,14 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -158,8 +159,12 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +185,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index b4fa5edb89..78022e869b 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), 
input.shape.num_elements().unwrap_nonnegative() * @@ -58,8 +57,8 @@ struct BackwardKernel { input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 6469dfc735..367e337b18 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -20,54 +20,20 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { -// __global__ void reverse_forward_kernel(float const *in_ptr, -// float *out_ptr, -// coord_t num_out_blks, -// coord_t reverse_dim_size, -// coord_t in_blk_size) { -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t out_idx = i; -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[out_idx] = in_ptr[in_idx]; -// } -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[i] = in_ptr[in_idx]; -// } -// } - -/* I mentioned this earlier, but I still think the reverse_forward_kernel code - is incorrect, even though it matches the code in inference/master? Whenever - I'm testing the code and printing out the output, I'm getting unexpected - outputs, and I think it's a result of modifying the loop index i in the - previous code? 
-*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = idx_within_blk / in_blk_size; - coord_t in_idx = idx_within_blk % in_blk_size; - coord_t input_index = - blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; - out_ptr[i] = in_ptr[input_index]; + i = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = i / in_blk_size; + i = i - reverse_dim_idx * in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; + out_ptr[out_idx] = in_ptr[in_idx]; } } diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index bbb15c5636..4e7fc56848 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -9,8 +9,8 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { - return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - - ff_dim.value.unwrap_nonnegative() - 1}}; + return legion_dim_t{num_dimensions - ff_dim.value - 1_n}; + ; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 3587cecedd..22da72912a 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -6,49 +6,96 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 2_n; - nonnegative_int size_per_input = 10_n; - ff_dim_t concat_axis = ff_dim_t{1_n}; - ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( - {num_inputs, size_per_input}, DataType::FLOAT); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - CHECK(contains_non_zero(output_accessor)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_rows, input_cols}, DataType::FLOAT); + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + 
CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {num_inputs * input_rows, input_cols}, DataType::FLOAT); + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {input_rows, num_inputs * input_cols}, DataType::FLOAT); + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, allocator); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_rows, input_cols}, DataType::FLOAT); + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {num_inputs * input_rows, input_cols}, DataType::FLOAT); + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {input_rows, num_inputs * input_cols}, DataType::FLOAT); + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index b75614588c..043617c790 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -3,54 +3,87 @@ #include "test_utils.h" using namespace ::FlexFlow; + TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); - TensorShape output_shape = - make_tensor_shape_from_legion_dims({50_n}, DataType::FLOAT); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + 
create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - create_random_filled_accessor_r(output_shape, allocator); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - CHECK(contains_non_zero(output_accessor)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({10_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, allocator); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - CHECK(contains_non_zero(input_grad_accessor)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::FLOAT); + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 677f1f8f5e..87834d83ac 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -53,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); TensorShape output_shape = 
make_tensor_shape_from_legion_dims(
-        {5_n, num_replicas}, DataType::FLOAT);
+    make_tensor_shape_from_legion_dims(
+        {num_replicas, 5_n}, DataType::FLOAT);

     ManagedPerDeviceFFHandle managed_handle{
         /*workSpaceSize=*/1024 * 1024,
diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc
index 4628cbd371..481958fdfc 100644
--- a/lib/kernels/test/src/test_reverse_kernels.cc
+++ b/lib/kernels/test/src/test_reverse_kernels.cc
@@ -7,9 +7,9 @@ using namespace ::FlexFlow;

 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Call Reverse Forward and Backward Kernels") {
+    nonnegative_int num_out_blks = 1_n;
     nonnegative_int reverse_dim_size = 10_n;
     nonnegative_int in_blk_size = 10_n;
-    nonnegative_int num_out_blks = 1_n;

     TensorShape input_shape = make_tensor_shape_from_legion_dims(
         {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT);
@@ -61,9 +61,9 @@
   }

   TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") {
-    nonnegative_int num_out_blks = 4_n;
-    nonnegative_int reverse_dim_size = 3_n;
-    nonnegative_int in_blk_size = 2_n;
+    nonnegative_int num_out_blks = 1_n;
+    nonnegative_int reverse_dim_size = 4_n;
+    nonnegative_int in_blk_size = 3_n;

     TensorShape input_shape = make_tensor_shape_from_legion_dims(
         {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT);
@@ -78,10 +78,6 @@ TEST_SUITE(FF_TEST_SUITE) {
     Allocator cpu_allocator = create_local_cpu_memory_allocator();

     SUBCASE("forward_kernel") {
-      auto transform = [counter = 0.0f](float val) mutable {
-        return counter++;
-      };
-
       // Run GPU Cast Forward Kernel
       GenericTensorAccessorR input_accessor_gpu =
           create_random_filled_accessor_r(input_shape, gpu_allocator);
@@ -103,8 +99,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       GenericTensorAccessorW output_accessor_cpu =
           create_zero_filled_accessor_w(output_shape, cpu_allocator);

-      Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu,
-                                           output_accessor_cpu);
+      Kernels::Reverse::cpu_forward_kernel(
+          input_accessor_cpu,
+          output_accessor_cpu,
+          num_out_blks.unwrap_nonnegative(),
+          reverse_dim_size.unwrap_nonnegative(),
+          in_blk_size.unwrap_nonnegative());

       CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu));
     }

@@ -113,6 +113,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       // Run GPU Cast Backward Kernel
       GenericTensorAccessorR output_grad_accessor_gpu =
           create_random_filled_accessor_r(output_shape, gpu_allocator);
+
       GenericTensorAccessorW input_grad_accessor_gpu =
           create_zero_filled_accessor_w(input_shape, gpu_allocator);

@@ -131,8 +132,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       GenericTensorAccessorW input_grad_accessor_cpu =
           create_zero_filled_accessor_w(input_shape, cpu_allocator);

-      Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu,
-                                            input_grad_accessor_cpu);
+      Kernels::Reverse::cpu_backward_kernel(
+          output_grad_accessor_cpu,
+          input_grad_accessor_cpu,
+          num_out_blks.unwrap_nonnegative(),
+          reverse_dim_size.unwrap_nonnegative(),
+          in_blk_size.unwrap_nonnegative());

       CHECK(accessors_are_equal(input_grad_accessor_gpu,
                                 input_grad_accessor_cpu));
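(A note on the dimension flips in the hunks above, e.g. {5_n, num_replicas}
becoming {num_replicas, 5_n}: legion dimension order is the reverse of ff
dimension order, so the same logical shape is spelled differently depending on
which ordering a helper expects. Below is a plain-int restatement of the
conversion; the real legion_dim_from_ff_dim, changed earlier in this patch,
operates on the ff_dim_t, legion_dim_t, and nonnegative_int wrappers instead.)

    // For a rank-3 tensor: ff dim 0 <-> legion dim 2, ff dim 1 <-> legion
    // dim 1, ff dim 2 <-> legion dim 0.
    int legion_dim_from_ff_dim_sketch(int ff_dim, int num_dims) {
      return num_dims - ff_dim - 1;
    }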
diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc
index 117c13a035..bc5f48654a 100644
--- a/lib/kernels/test/src/test_utils.cc
+++ b/lib/kernels/test/src/test_utils.cc
@@ -5,13 +5,6 @@

 namespace FlexFlow {

-GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape,
-                                                     Allocator &allocator) {
-  GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape);
-  fill_with_zeros(result_accessor);
-  return result_accessor;
-}
-
 TensorShape make_tensor_shape_from_legion_dims(FFOrdered<nonnegative_int> dims,
                                                DataType DT) {
   return TensorShape{
@@ -22,6 +15,20 @@ TensorShape make_tensor_shape_from_legion_dims(FFOrdered<nonnegative_int> dims,
                                                DataType DT) {
   };
 }

+GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape,
+                                                     Allocator &allocator) {
+  GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape);
+  fill_with_zeros(result_accessor);
+  return result_accessor;
+}
+
+GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape,
+                                                     Allocator &allocator) {
+  GenericTensorAccessorW accessor =
+      create_zero_filled_accessor_w(shape, allocator);
+  return read_only_accessor_from_write_accessor(accessor);
+}
+
 template <DataType DT>
 struct CreateRandomFilledAccessorW {
   GenericTensorAccessorW operator()(TensorShape const &shape,
@@ -46,7 +53,7 @@ struct CreateRandomFilledAccessorW {
         data_ptr[i] = dist(gen);
       }
     } else if constexpr (std::is_integral<T>::value) {
-      std::uniform_int_distribution<T> dist(0, 100);
+      std::uniform_int_distribution<T> dist(0, 99);
       for (size_t i = 0; i < num_elements; i++) {
         data_ptr[i] = dist(gen);
       }
@@ -145,15 +152,19 @@ template <DataType DT>
 struct Print2DCPUAccessorR {
   void operator()(GenericTensorAccessorR const &accessor,
                   std::ostream &stream) {
-    int rows = accessor.shape.at(legion_dim_t{0_n});
-    int cols = accessor.shape.at(legion_dim_t{1_n});
+    int const dims = accessor.shape.num_dims();
+    int const cols = accessor.shape.at(legion_dim_t{0_n});
+    int const rows = (dims == 2) ? accessor.shape.at(legion_dim_t{1_n}) : 1_n;
+
+    auto get_element = [dims, &accessor](int j, int i) {
+      return (dims == 1) ? accessor.at<DT>
({j}) : accessor.at<DT>
({j, i}); + }; std::vector indices(cols); std::iota(indices.begin(), indices.end(), 0); - - for (int i = 0; i < rows; i++) { - stream << join_strings(indices, " ", [&](int k) { - return accessor.at
({i, k}); + for (int i = 0; i < rows; ++i) { + stream << join_strings(indices, " ", [=](int j) { + return get_element(j, i); }) << std::endl; } } @@ -165,7 +176,7 @@ void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, GenericTensorAccessorR cpu_accessor = copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); DataTypeDispatch1{}( - accessor.data_type, accessor, stream); + accessor.data_type, cpu_accessor, stream); } template diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 1d60562322..093a9a4a97 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -26,6 +26,9 @@ GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, Allocator &allocator); +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, DataType DT); @@ -41,7 +44,8 @@ GenericTensorAccessorR copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, Allocator &allocator); -void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor); +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream); bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, GenericTensorAccessorR const &accessor_b); diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h index 0bcc8cfd6f..150fb9ba8b 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h @@ -43,6 +43,9 @@ class nonnegative_int { nonnegative_int operator++(int); nonnegative_int &operator+=(nonnegative_int const &other); + nonnegative_int operator-(nonnegative_int const &other) const; + nonnegative_int &operator-=(nonnegative_int const &other); + nonnegative_int operator*(nonnegative_int const &other) const; nonnegative_int &operator*=(nonnegative_int const &other); diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc index e86c242250..75e2a349ec 100644 --- a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc @@ -101,6 +101,15 @@ nonnegative_int &nonnegative_int::operator+=(nonnegative_int const &other) { return *this; } +nonnegative_int nonnegative_int::operator-(nonnegative_int const &other) const { + return nonnegative_int{this->value_ - other.value_}; +} + +nonnegative_int &nonnegative_int::operator-=(nonnegative_int const &other) { + *this = nonnegative_int{this->value_ - other.value_}; + return *this; +} + nonnegative_int nonnegative_int::operator*(nonnegative_int const &other) const { return nonnegative_int{this->value_ * other.value_}; } From 157407d3f0b8511d2cac18e7c3d7ac1a60816c42 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 9 Feb 2025 20:44:42 -0800 Subject: [PATCH 39/42] managed stream / handle test case fix --- .../test/src/test_managed_ff_stream.cc | 97 ++++++++++++++----- .../src/test_managed_per_device_ff_handle.cc | 6 +- 2 files changed, 76 insertions(+), 27 deletions(-) diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 3535dd258c..87b564d284 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ 
b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -1,36 +1,89 @@ #include "doctest/doctest.h" -#include "kernels/managed_ff_stream.h" +#include "kernels/gather_kernels.h" +#include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("ManagedFFStream") { - ManagedFFStream base_stream{}; - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - - SUBCASE("move constructor") { - ManagedFFStream new_stream(std::move(base_stream)); - CHECK(&base_stream.raw_stream() == nullptr); - CHECK(&new_stream.raw_stream() == base_stream_ptr); - } + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("move assignment operator") { - SUBCASE("move assign to other") { - ManagedFFStream new_stream{}; - new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); - CHECK(&new_stream.raw_stream() == base_stream_ptr); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); } - SUBCASE("move assign to self") { - base_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == base_stream_ptr); + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({10_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); } } - SUBCASE("Test Self-Assignment") { - base_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == base_stream_ptr); + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = + 
make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::FLOAT); + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d081a0b07c..5902664a14 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -4,7 +4,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("ManagedPerDeviceFFHandle") { + TEST_CASE("Test ManagedPerDeviceFFHandle") { ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, /*allowTensorOpMathConversion=*/true}; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); @@ -16,8 +16,6 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("move constructor") { ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); - - CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } @@ -27,8 +25,6 @@ TEST_SUITE(FF_TEST_SUITE) { /*workSpaceSize=*/1024 * 1024, /*allowTensorOpMathConversion=*/true}; new_handle = std::move(base_handle); - - CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } From f73e7a1784b80a0d1584d5141e10f525497dd99c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 24 Feb 2025 19:37:03 -0800 Subject: [PATCH 40/42] accessor, array_shape, copy_tensor_accessor, datatype_dispatch, allocator, and perf_metrics tests --- .envrc | 3 + .proj.toml | 1 + .vimrc | 8 ++ lib/kernels/include/kernels/accessor.h | 9 +- .../include/kernels/copy_tensor_accessor.h | 11 ++ lib/kernels/src/accessor.cc | 73 +++++----- lib/kernels/src/array_shape.cc | 38 ++++- lib/kernels/src/copy_tensor_accessor.cc | 59 ++++++++ lib/kernels/src/legion_dim.cc | 1 - lib/kernels/src/perf_metrics.cc | 5 +- lib/kernels/test/src/test_accessor.cc | 136 ++++++++++++++++++ lib/kernels/test/src/test_array_shape.cc | 105 ++++++++++++++ lib/kernels/test/src/test_attention_kernel.cc | 10 +- .../test/src/test_batch_matmul_kernel.cc | 6 +- .../test/src/test_batch_norm_kernel.cc | 8 +- lib/kernels/test/src/test_cast_kernel.cc | 10 +- lib/kernels/test/src/test_combine_kernel.cc | 10 +- lib/kernels/test/src/test_concat_kernel.cc | 12 +- .../test/src/test_copy_tensor_accessor.cc | 76 ++++++++++ .../test/src/test_datatype_dispatch.cc | 65 +++++++++ lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 18 +-- .../test/src/test_layer_norm_kernels.cc | 4 +- lib/kernels/test/src/test_legion_dim.cc | 29 ++++ .../test/src/test_local_cpu_allocator.cc | 19 +++ .../test/src/test_local_cuda_allocator.cc | 19 +++ .../test/src/test_managed_ff_stream.cc | 18 +-- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_perf_metrics.cc | 127 ++++++++++++++++ lib/kernels/test/src/test_pool_2d_kernels.cc | 4 +- lib/kernels/test/src/test_reduction_kernel.cc | 4 +- lib/kernels/test/src/test_replicate_kernel.cc | 12 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 8 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 4 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- 
lib/kernels/test/src/test_utils.cc | 63 +------- lib/kernels/test/src/test_utils.h | 16 +-- 40 files changed, 815 insertions(+), 188 deletions(-) create mode 100644 .envrc create mode 100644 .vimrc create mode 100644 lib/kernels/test/src/test_accessor.cc create mode 100644 lib/kernels/test/src/test_array_shape.cc create mode 100644 lib/kernels/test/src/test_copy_tensor_accessor.cc create mode 100644 lib/kernels/test/src/test_datatype_dispatch.cc create mode 100644 lib/kernels/test/src/test_legion_dim.cc create mode 100644 lib/kernels/test/src/test_local_cpu_allocator.cc create mode 100644 lib/kernels/test/src/test_local_cuda_allocator.cc create mode 100644 lib/kernels/test/src/test_perf_metrics.cc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..2797f0f929 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source_up_if_exists + +use flake diff --git a/.proj.toml b/.proj.toml index 10307a6efa..b3b90bbada 100644 --- a/.proj.toml +++ b/.proj.toml @@ -15,6 +15,7 @@ build_targets = [ "models", "export-model-arch", "substitution-to-dot", + "kernels-tests", ] test_targets = [ diff --git a/.vimrc b/.vimrc new file mode 100644 index 0000000000..4c8a8a8279 --- /dev/null +++ b/.vimrc @@ -0,0 +1,8 @@ +" example search path configuration +set path=lib/runtime/**,lib/** + +" set build target +" let g:target = "pcg" + +" set test target +" let g:test_target = "utils-test" diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 52ca62e217..8bbcf3ef95 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -267,6 +267,12 @@ std::vector const *> return out; } +bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); @@ -280,9 +286,6 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorR const &src_accessor); - } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h index da8af71e4f..97b6254750 100644 --- a/lib/kernels/include/kernels/copy_tensor_accessor.h +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -6,6 +6,9 @@ namespace FlexFlow { +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); + GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator); @@ -14,6 +17,14 @@ GenericTensorAccessorW copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, Allocator &allocator); +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &allocator); + } // namespace FlexFlow #endif diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 1a0abec1c5..43f57717f8 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,38 +1,45 @@ #include "kernels/accessor.h" -#include 
"kernels/allocation.h" +#include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include +#include namespace FlexFlow { -void copy_accessor_data_to_l_from_r( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorR const &src_accessor) { - size_t num_bytes = - dst_accessor.shape.get_volume().unwrap_nonnegative() * - size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); - - DeviceType dst_device_type = dst_accessor.device_type; - DeviceType src_device_type = src_accessor.device_type; - - if (src_device_type == DeviceType::CPU && - dst_device_type == DeviceType::CPU) { - memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); - } else if (src_device_type == DeviceType::CPU && - dst_device_type == DeviceType::GPU) { - checkCUDA(cudaMemcpy( - dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); - } else if (src_device_type == DeviceType::GPU && - dst_device_type == DeviceType::CPU) { - checkCUDA(cudaMemcpy( - dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); - } else { - assert(src_device_type == DeviceType::GPU); - assert(dst_device_type == DeviceType::GPU); - checkCUDA(cudaMemcpy(dst_accessor.ptr, - src_accessor.ptr, - num_bytes, - cudaMemcpyDeviceToDevice)); +template +struct AccessorDataIsEqual { + bool operator()(GenericTensorAccessorR const &a, + GenericTensorAccessorR const &b) { + int const num_elements = a.shape.num_elements().unwrap_nonnegative(); + if (num_elements != b.shape.num_elements().unwrap_nonnegative()) { + return false; + } + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + auto cpu_a = copy_accessor_r_to_cpu_if_necessary(a, cpu_allocator); + auto cpu_b = copy_accessor_r_to_cpu_if_necessary(b, cpu_allocator); + + using T = real_type_t
<DT>; T const *a_ptr = cpu_a.get<DT>(); T const *b_ptr = cpu_b.get<DT>
(); + + return std::equal(a_ptr, a_ptr + num_elements, b_ptr); } +}; + +bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + return accessor_a.data_type == accessor_b.data_type && + accessor_a.device_type == accessor_b.device_type && + accessor_a.shape == accessor_b.shape && + accessor_data_is_equal(accessor_a, accessor_b); } GenericTensorAccessorW::operator GenericTensorAccessorR() const { @@ -56,12 +63,12 @@ std::tupletie() == other.tie(); + return accessors_are_equal(*this, other); } bool GenericTensorAccessorW::operator!=( GenericTensorAccessorW const &other) const { - return this->tie() != other.tie(); + return !(accessors_are_equal(*this, other)); } int32_t *GenericTensorAccessorW::get_int32_ptr() const { @@ -112,12 +119,12 @@ std::tupletie() == other.tie(); + return accessors_are_equal(*this, other); } bool GenericTensorAccessorR::operator!=( GenericTensorAccessorR const &other) const { - return this->tie() != other.tie(); + return !(accessors_are_equal(*this, other)); } int32_t const *GenericTensorAccessorR::get_int32_ptr() const { diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 30db65cc03..499aebad86 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -51,18 +51,40 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } +legion_dim_t ArrayShape::last_idx() const { + if (this->dims.size() == 0) { + throw mk_runtime_error("Cannot get last index of an empty shape"); + } + return legion_dim_t(nonnegative_int{this->dims.size() - 1}); +} + +legion_dim_t ArrayShape::neg_idx(int idx) const { + if (std::abs(idx) > this->dims.size()) { + throw mk_runtime_error( + fmt::format("Invalid negative index: {} (shape has {} dimensions)", + idx, + this->dims.size())); + } + + if (idx >= 0) { + throw mk_runtime_error(fmt::format( + "Idx should be negative for negative indexing, got {}", idx)); + } + + return legion_dim_t(nonnegative_int{this->dims.size() + idx}); +} + bool ArrayShape::operator==(ArrayShape const &other) const { - return this->tie() == other.tie(); + return this->dims == other.dims; } bool ArrayShape::operator!=(ArrayShape const &other) const { - return this->tie() != other.tie(); + return !(this->dims == other.dims); } ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { - nonnegative_int num_dims = this->num_dims(); auto to_legion_index = [num_dims](auto arg) -> nonnegative_int { @@ -85,7 +107,9 @@ ArrayShape ArrayShape::sub_shape( "Invalid sub_shape range: start={}, end={}", start_idx, end_idx)); } - return ArrayShape(&this->dims[legion_dim_t{start_idx}], end_idx - start_idx); + return ArrayShape(std::vector( + this->dims.begin() + start_idx.unwrap_nonnegative(), + this->dims.begin() + end_idx.unwrap_nonnegative())); } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -97,7 +121,11 @@ std::optional ArrayShape::at_maybe(legion_dim_t index) const { } std::optional ArrayShape::at_maybe(ff_dim_t index) const { - return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); + if (index.value < this->num_dims()) { + return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); + } else { + return std::nullopt; + } } std::tuple const 
&> ArrayShape::tie() const { diff --git a/lib/kernels/src/copy_tensor_accessor.cc b/lib/kernels/src/copy_tensor_accessor.cc index 6a3ad8033a..cc033223f8 100644 --- a/lib/kernels/src/copy_tensor_accessor.cc +++ b/lib/kernels/src/copy_tensor_accessor.cc @@ -3,6 +3,37 @@ namespace FlexFlow { +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + template struct CopyTensorAccessorW { GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, @@ -45,4 +76,32 @@ GenericTensorAccessorR src_accessor.data_type, src_accessor, allocator); } +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) { + throw mk_runtime_error("Allocator must be a CPU allocator"); + } + + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} + +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) { + throw mk_runtime_error("Allocator must be a CPU allocator"); + } + + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); + } + return cpu_accessor; +} + } // namespace FlexFlow diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 4e7fc56848..14016a6202 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -10,7 +10,6 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { return legion_dim_t{num_dimensions - ff_dim.value - 1_n}; - ; } } // namespace FlexFlow diff --git a/lib/kernels/src/perf_metrics.cc b/lib/kernels/src/perf_metrics.cc index 2036ddd35a..ab0e113a26 100644 --- a/lib/kernels/src/perf_metrics.cc +++ b/lib/kernels/src/perf_metrics.cc @@ -15,8 +15,9 @@ PerfMetrics::PerfMetrics(int _train_all, double _start_time_micro, double _current_time_micro) : train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss), - mse_loss(_mse_loss), rmse_loss(_rmse_loss), mae_loss(_mae_loss), - start_time(_start_time_micro), current_time(_current_time_micro) {} + 
sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss), + rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro), + current_time(_current_time_micro) {} float get_throughput(PerfMetrics const &m) { return m.train_all / (m.current_time - m.start_time); diff --git a/lib/kernels/test/src/test_accessor.cc b/lib/kernels/test/src/test_accessor.cc new file mode 100644 index 0000000000..e9611a928c --- /dev/null +++ b/lib/kernels/test/src/test_accessor.cc @@ -0,0 +1,136 @@ +#include "doctest/doctest.h" +#include "kernels/accessor.h" +#include "op-attrs/datatype_value.h" +#include "test_utils.h" + +using namespace ::FlexFlow; + +template +void check_accessor_get(GenericTensorAccessorR const &accessor, + real_type_t
<DT> expected) {
+  CHECK(*accessor.get<DT>() == expected);
+
+  if constexpr (DT == DataType::INT32) {
+    CHECK(*accessor.get_int32_ptr() == expected);
+  } else if constexpr (DT == DataType::INT64) {
+    CHECK(*accessor.get_int64_ptr() == expected);
+  } else if constexpr (DT == DataType::FLOAT) {
+    CHECK(*accessor.get_float_ptr() == doctest::Approx(expected));
+  } else if constexpr (DT == DataType::DOUBLE) {
+    CHECK(*accessor.get_double_ptr() == doctest::Approx(expected));
+  } else if constexpr (DT == DataType::HALF) {
+    CHECK(*accessor.get_half_ptr() == doctest::Approx(expected));
+  }
+}
+
+template <DataType DT>
+void run_accessor_w_test(DataTypeValue value,
+                         real_type_t<DT> expected,
+                         Allocator allocator) {
+  TensorShape shape = make_tensor_shape_from_ff_ordered({1_n}, DT);
+  GenericTensorAccessorW accessor =
+      create_filled_accessor_w(shape, allocator, value);
+  check_accessor_get<DT>(read_only_accessor_from_write_accessor(accessor),
+                         expected);
+}
+
+template <DataType DT>
+void run_accessor_r_test(DataTypeValue value,
+                         real_type_t<DT> expected,
+                         Allocator allocator) {
+  TensorShape shape = make_tensor_shape_from_ff_ordered({1_n}, DT);
+  GenericTensorAccessorR accessor =
+      create_filled_accessor_r(shape, allocator, value);
+  check_accessor_get<DT>
(accessor, expected); +} + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test GenericTensorAccessors") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("Test GenericTensorAccessorW") { + SUBCASE("Test get methods for GenericTensorAccessorW") { + run_accessor_w_test( + make_int32_data_type_value(12345), 12345, cpu_allocator); + run_accessor_w_test( + make_int64_data_type_value(12345LL), 12345LL, cpu_allocator); + run_accessor_w_test( + make_float_data_type_value(1.23f), 1.23f, cpu_allocator); + run_accessor_w_test( + make_double_data_type_value(1.23), 1.23, cpu_allocator); + } + + SUBCASE("Test operator== and operator!= for GenericTensorAccessorW") { + TensorShape shape = + make_tensor_shape_from_ff_ordered({1_n}, DataType::INT32); + + GenericTensorAccessorW accessor1 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorW accessor2 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorW accessor3 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(54321)); + + CHECK(accessor1 == accessor2); + CHECK(accessor1 != accessor3); + } + + SUBCASE("Test at() method for GenericTensorAccessorW") { + DataType const DT = DataType::INT32; + TensorShape shape = make_tensor_shape_from_ff_ordered({3_n, 3_n}, DT); + + GenericTensorAccessorW accessor_1 = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW accessor_2 = + copy_tensor_accessor_w(accessor_1, cpu_allocator); + + CHECK(accessor_1.at
<DT>({0, 0}) == accessor_2.at<DT>({0, 0}));
+        CHECK(accessor_1.at<DT>({1, 0}) == accessor_2.at<DT>({1, 0}));
+        CHECK(accessor_1.at<DT>({2, 2}) == accessor_2.at<DT>
({2, 2})); + } + } + + SUBCASE("Test GenericTensorAccessorR") { + + SUBCASE("Test get methods for GenericTensorAccessorR") { + run_accessor_r_test( + make_int32_data_type_value(12345), 12345, cpu_allocator); + run_accessor_r_test( + make_int64_data_type_value(12345LL), 12345LL, cpu_allocator); + run_accessor_r_test( + make_float_data_type_value(1.23f), 1.23f, cpu_allocator); + run_accessor_r_test( + make_double_data_type_value(1.23), 1.23, cpu_allocator); + } + + SUBCASE("Test operator== and operator!= for GenericTensorAccessorR") { + TensorShape shape = + make_tensor_shape_from_ff_ordered({1_n}, DataType::INT32); + + GenericTensorAccessorR accessor1 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorR accessor2 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorR accessor3 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(54321)); + + CHECK(accessor1 == accessor2); + CHECK(accessor1 != accessor3); + } + + SUBCASE("Test at() method for GenericTensorAccessorR") { + DataType const DT = DataType::INT32; + TensorShape shape = make_tensor_shape_from_ff_ordered({3_n, 3_n}, DT); + + GenericTensorAccessorR accessor_1 = + create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR accessor_2 = + copy_tensor_accessor_r(accessor_1, cpu_allocator); + + CHECK(accessor_1.at
<DT>({0, 0}) == accessor_2.at<DT>({0, 0}));
+        CHECK(accessor_1.at<DT>({1, 0}) == accessor_2.at<DT>({1, 0}));
+        CHECK(accessor_1.at<DT>({2, 2}) == accessor_2.at<DT>
({2, 2})); + } + } + } +} diff --git a/lib/kernels/test/src/test_array_shape.cc b/lib/kernels/test/src/test_array_shape.cc new file mode 100644 index 0000000000..7ede1791ef --- /dev/null +++ b/lib/kernels/test/src/test_array_shape.cc @@ -0,0 +1,105 @@ +#include "doctest/doctest.h" +#include "kernels/array_shape.h" +#include "test_utils.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test ArrayShape") { + ArrayShape shape({1_n, 2_n, 3_n, 4_n}); + + SUBCASE("Test get_volume() and num_elements()") { + CHECK(shape.get_volume() == 1 * 2 * 3 * 4); + CHECK(shape.num_elements() == 1 * 2 * 3 * 4); + } + + SUBCASE("Test num_dims() and get_dim()") { + CHECK(shape.num_dims() == 4); + CHECK(shape.get_dim() == 4); + } + + SUBCASE("Test operator[] and at()") { + CHECK(shape[legion_dim_t{0_n}] == 1); + CHECK(shape[legion_dim_t{1_n}] == 2); + CHECK(shape[legion_dim_t{2_n}] == 3); + CHECK(shape[legion_dim_t{3_n}] == 4); + + CHECK(shape.at(legion_dim_t{0_n}) == 1); + CHECK(shape.at(legion_dim_t{1_n}) == 2); + CHECK(shape.at(legion_dim_t{2_n}) == 3); + CHECK(shape.at(legion_dim_t{3_n}) == 4); + + CHECK(shape.at(ff_dim_t{0_n}) == 4); + CHECK(shape.at(ff_dim_t{1_n}) == 3); + CHECK(shape.at(ff_dim_t{2_n}) == 2); + CHECK(shape.at(ff_dim_t{3_n}) == 1); + } + + SUBCASE("Test operator== and operator!=") { + ArrayShape shape2({1_n, 2_n, 3_n, 4_n}); + ArrayShape shape3({1_n, 2_n, 3_n, 5_n}); + + CHECK(shape == shape2); + CHECK(shape != shape3); + } + + SUBCASE("Test last_idx()") { + CHECK(shape.last_idx() == legion_dim_t{3_n}); + + ArrayShape empty_shape(std::vector{}); + CHECK_THROWS(empty_shape.last_idx()); + } + + SUBCASE("Test neg_idx()") { + CHECK(shape.neg_idx(-1) == legion_dim_t{3_n}); + CHECK(shape.neg_idx(-2) == legion_dim_t{2_n}); + CHECK(shape.neg_idx(-3) == legion_dim_t{1_n}); + CHECK(shape.neg_idx(-4) == legion_dim_t{0_n}); + + CHECK_THROWS(shape.neg_idx(-5)); + } + + SUBCASE("Test at_maybe()") { + CHECK(shape.at_maybe(legion_dim_t{0_n}).value() == 1); + CHECK(shape.at_maybe(legion_dim_t{1_n}).value() == 2); + CHECK(shape.at_maybe(legion_dim_t{2_n}).value() == 3); + CHECK(shape.at_maybe(legion_dim_t{3_n}).value() == 4); + CHECK(!shape.at_maybe(legion_dim_t{4_n}).has_value()); + + CHECK(shape.at_maybe(ff_dim_t{0_n}).value() == 4); + CHECK(shape.at_maybe(ff_dim_t{1_n}).value() == 3); + CHECK(shape.at_maybe(ff_dim_t{2_n}).value() == 2); + CHECK(shape.at_maybe(ff_dim_t{3_n}).value() == 1); + CHECK(!shape.at_maybe(ff_dim_t{4_n}).has_value()); + } + + SUBCASE("Test subshape()") { + SUBCASE("Test basic subshape") { + ArrayShape ref_shape({2_n, 3_n}); + ArrayShape subshape = + shape.sub_shape(legion_dim_t{1_n}, legion_dim_t{3_n}); + + CHECK(ref_shape == subshape); + } + + SUBCASE("Test empty subshape") { + ArrayShape ref_shape(std::vector{}); + ArrayShape subshape = + shape.sub_shape(legion_dim_t{0_n}, legion_dim_t{0_n}); + CHECK(ref_shape == subshape); + } + + SUBCASE("Test subshape with no start") { + ArrayShape ref_shape({1_n, 2_n, 3_n}); + ArrayShape subshape = shape.sub_shape(std::nullopt, legion_dim_t{3_n}); + CHECK(ref_shape == subshape); + } + + SUBCASE("Test subshape with no end") { + ArrayShape ref_shape({2_n, 3_n, 4_n}); + ArrayShape subshape = shape.sub_shape(legion_dim_t{1_n}, std::nullopt); + CHECK(ref_shape == subshape); + } + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index bd0167a677..6b54554a9b 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ 
b/lib/kernels/test/src/test_attention_kernel.cc @@ -41,15 +41,15 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_tensor_shape_from_legion_dims( + TensorShape query_shape = make_tensor_shape_from_ff_ordered( {qoSeqLength, num_samples, qSize}, DataType::FLOAT); - TensorShape key_shape = make_tensor_shape_from_legion_dims( + TensorShape key_shape = make_tensor_shape_from_ff_ordered( {kvSeqLength, num_samples, kSize}, DataType::FLOAT); - TensorShape value_shape = make_tensor_shape_from_legion_dims( + TensorShape value_shape = make_tensor_shape_from_ff_ordered( {kvSeqLength, num_samples, vSize}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); - TensorShape weight_shape = make_tensor_shape_from_legion_dims( + TensorShape weight_shape = make_tensor_shape_from_ff_ordered( {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index d78d5daee5..ba9b3ac0e2 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -22,11 +22,11 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_tensor_shape_from_legion_dims({m, k, batch}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_tensor_shape_from_legion_dims({k, n, batch}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index d0ec2559ba..698a320a69 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -29,13 +29,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {output_n, output_c, output_h, output_w}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {output_n, output_c, output_h, output_w}, DataType::FLOAT); - TensorShape scale_shape = make_tensor_shape_from_legion_dims( + TensorShape scale_shape = make_tensor_shape_from_ff_ordered( {output_n, output_c, output_h, output_w}, DataType::FLOAT); - TensorShape bias_shape = make_tensor_shape_from_legion_dims( + TensorShape bias_shape = make_tensor_shape_from_ff_ordered( {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index c59d8eae3f..d314a6bcc2 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -11,9 +11,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = 
create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::DOUBLE); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = @@ -48,9 +48,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10_n, 2_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n, 2_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({10_n, 2_n}, DataType::DOUBLE); + make_tensor_shape_from_ff_ordered({10_n, 2_n}, DataType::DOUBLE); // Only calling forward kernel as backward kernel is exactly the same SUBCASE("forward_kernel") { @@ -72,7 +72,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 97fa81920b..b30d1ab7f4 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({5_n, 5_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({5_n, 5_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -72,7 +72,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -95,8 +95,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, input_grad_accessor_cpu); - CHECK(accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 22da72912a..f8bc31c3d5 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_cols, TensorShape output_shape, ff_dim_t concat_axis) { - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {input_rows, input_cols}, DataType::FLOAT); std::vector input_accessors = @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test forward concat, axis = 0") { nonnegative_int input_rows = 2_n; nonnegative_int input_cols = 4_n; - TensorShape output_shape = 
make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {num_inputs * input_rows, input_cols}, DataType::FLOAT); run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); } @@ -49,7 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test forward concat, axis = 1") { nonnegative_int input_rows = 4_n; nonnegative_int input_cols = 2_n; - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {input_rows, num_inputs * input_cols}, DataType::FLOAT); run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); } @@ -60,7 +60,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_cols, TensorShape output_shape, ff_dim_t concat_axis) { - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {input_rows, input_cols}, DataType::FLOAT); GenericTensorAccessorR output_grad_accessor = @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test backward concat, axis = 0") { nonnegative_int input_rows = 2_n; nonnegative_int input_cols = 4_n; - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {num_inputs * input_rows, input_cols}, DataType::FLOAT); run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); } @@ -92,7 +92,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test backward concat, axis = 1") { nonnegative_int input_rows = 4_n; nonnegative_int input_cols = 2_n; - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {input_rows, num_inputs * input_cols}, DataType::FLOAT); run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); } diff --git a/lib/kernels/test/src/test_copy_tensor_accessor.cc b/lib/kernels/test/src/test_copy_tensor_accessor.cc new file mode 100644 index 0000000000..a6a4cfde53 --- /dev/null +++ b/lib/kernels/test/src/test_copy_tensor_accessor.cc @@ -0,0 +1,76 @@ +#include "doctest/doctest.h" +#include "kernels/accessor.h" +#include "op-attrs/datatype_value.h" +#include "test_utils.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test copy_tensor_accessor") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + TensorShape shape = + make_tensor_shape_from_ff_ordered({5_n, 5_n}, DataType::FLOAT); + + SUBCASE("Test copy_tensor_accessor_r") { + GenericTensorAccessorR src_accessor = + create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_tensor_accessor_r(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + } + + SUBCASE("Test copy_tensor_accessor_w") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_tensor_accessor_w(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + } + + SUBCASE("Test copy_accessor_r_to_cpu_if_necessary") { + SUBCASE("Test necessary") { + GenericTensorAccessorR src_accessor = + create_random_filled_accessor_r(shape, gpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_accessor_r_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + + 
SUBCASE("Test not necessary") { + GenericTensorAccessorR src_accessor = + create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_accessor_r_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + } + + SUBCASE("Test copy_accessor_w_to_cpu_if_necessary") { + SUBCASE("Test necessary") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, gpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_accessor_w_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + + SUBCASE("Test not necessary") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_accessor_w_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + } + } +} diff --git a/lib/kernels/test/src/test_datatype_dispatch.cc b/lib/kernels/test/src/test_datatype_dispatch.cc new file mode 100644 index 0000000000..41737d715a --- /dev/null +++ b/lib/kernels/test/src/test_datatype_dispatch.cc @@ -0,0 +1,65 @@ +#include "doctest/doctest.h" +#include "kernels/datatype_dispatch.h" + +using namespace ::FlexFlow; + +template +struct TestDatatypeDispatch1 { + int operator()(int value) { + if (DT == DataType::FLOAT) { + return value + 1; + } else if (DT == DataType::INT32) { + return value + 2; + } else { + return value + 3; + } + } +}; + +template +struct TestDatatypeDispatch2 { + void operator()(int &value) { + if (IDT == DataType::INT32 && ODT == DataType::FLOAT) { + value *= 2; + } else if (IDT == DataType::FLOAT && ODT == DataType::INT32) { + value *= 3; + } else { + value *= 4; + } + } +}; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test DataTypeDispatch") { + SUBCASE("Test DataTypeDispatch1") { + CHECK(DataTypeDispatch1{}(DataType::FLOAT, 1) == + 2); + CHECK(DataTypeDispatch1{}(DataType::INT32, 1) == + 3); + CHECK(DataTypeDispatch1{}(DataType::DOUBLE, 1) == + 4); + } + + SUBCASE("Test DataTypeDispatch2") { + int value = 1; + + SUBCASE("Case One") { + DataTypeDispatch2{}( + DataType::INT32, DataType::FLOAT, value); + CHECK(value == 2); + } + + SUBCASE("Case Two") { + DataTypeDispatch2{}( + DataType::FLOAT, DataType::INT32, value); + CHECK(value == 3); + } + + SUBCASE("Test Three") { + DataTypeDispatch2{}( + DataType::DOUBLE, DataType::DOUBLE, value); + CHECK(value == 4); + } + } + } +} diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 1a34c59be6..e5eba341f3 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input_shape = - make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 238c4ac361..ee4554d00a 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = - 
make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 043617c790..64cc824b9b 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -37,21 +37,21 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test gather forward, 2D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::FLOAT); run_forward_test(input_shape, index_shape, output_shape); } SUBCASE("test gather forward, 1D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({10_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); run_forward_test(input_shape, index_shape, output_shape); } } @@ -77,11 +77,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test gather backward, 2D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::FLOAT); run_backward_test(input_shape, index_shape, output_shape); } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 8368fe4efd..4d5802936e 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -12,11 +12,11 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({feature_size}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, diff --git a/lib/kernels/test/src/test_legion_dim.cc b/lib/kernels/test/src/test_legion_dim.cc new file mode 100644 index 0000000000..c06b779ad8 --- /dev/null +++ b/lib/kernels/test/src/test_legion_dim.cc @@ -0,0 +1,29 @@ +#include "doctest/doctest.h" +#include "kernels/legion_dim.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LegionDim") { + SUBCASE("Test add_to_legion_dim") { + legion_dim_t dim{1_n}; + CHECK(add_to_legion_dim(dim, 2) == 
legion_dim_t{3_n}); + } + + SUBCASE("Test legion_dim_from_ff_dim") { + CHECK(legion_dim_from_ff_dim(ff_dim_t{0_n}, 4_n) == legion_dim_t{3_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{1_n}, 4_n) == legion_dim_t{2_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{2_n}, 4_n) == legion_dim_t{1_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{3_n}, 4_n) == legion_dim_t{0_n}); + } + + SUBCASE("Test LegionOrdered") { + LegionOrdered legion_ordered{1, 2, 3, 4}; + + SUBCASE("Test ff_ordered_from_legion_ordered") { + CHECK(ff_ordered_from_legion_ordered(legion_ordered) == + FFOrdered{4, 3, 2, 1}); + } + } + } +} diff --git a/lib/kernels/test/src/test_local_cpu_allocator.cc b/lib/kernels/test/src/test_local_cpu_allocator.cc new file mode 100644 index 0000000000..fa6bce36db --- /dev/null +++ b/lib/kernels/test/src/test_local_cpu_allocator.cc @@ -0,0 +1,19 @@ +#include "kernels/local_cpu_allocator.h" +#include "doctest/doctest.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LocalCPUAllocator") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("Test allocate and deallocate") { + void *ptr = cpu_allocator.allocate(100); + CHECK(ptr != nullptr); + cpu_allocator.deallocate(ptr); + } + + SUBCASE("Test get_allocation_device_type") { + CHECK(cpu_allocator.get_allocation_device_type() == DeviceType::CPU); + } + } +} diff --git a/lib/kernels/test/src/test_local_cuda_allocator.cc b/lib/kernels/test/src/test_local_cuda_allocator.cc new file mode 100644 index 0000000000..c091576bd3 --- /dev/null +++ b/lib/kernels/test/src/test_local_cuda_allocator.cc @@ -0,0 +1,19 @@ +#include "kernels/local_cuda_allocator.h" +#include "doctest/doctest.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LocalCUDAAllocator") { + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + + SUBCASE("Test allocate and deallocate") { + void *ptr = gpu_allocator.allocate(100); + CHECK(ptr != nullptr); + gpu_allocator.deallocate(ptr); + } + + SUBCASE("Test get_allocation_device_type") { + CHECK(gpu_allocator.get_allocation_device_type() == DeviceType::GPU); + } + } +} diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 87b564d284..841c9a82ab 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -37,21 +37,21 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test gather forward, 2D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({2_n, 20_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::FLOAT); run_forward_test(input_shape, index_shape, output_shape); } SUBCASE("test gather forward, 1D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({10_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); run_forward_test(input_shape, 
index_shape, output_shape); } } @@ -77,11 +77,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("test gather backward, 2D") { TensorShape input_shape = - make_tensor_shape_from_legion_dims({2_n, 100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); TensorShape index_shape = - make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::INT32); + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::INT32); TensorShape output_shape = - make_tensor_shape_from_legion_dims({2_n, 25_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::FLOAT); run_backward_test(input_shape, index_shape, output_shape); } } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index c1be78bd16..e9fab697bb 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_perf_metrics.cc b/lib/kernels/test/src/test_perf_metrics.cc new file mode 100644 index 0000000000..e958a808b7 --- /dev/null +++ b/lib/kernels/test/src/test_perf_metrics.cc @@ -0,0 +1,127 @@ +#include "kernels/perf_metrics.h" +#include "doctest/doctest.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test PerfMetrics Constructors and Metric Functions") { + SUBCASE("Test constructor with start_time only") { + double start = 100.0; + PerfMetrics pm(start); + + CHECK(pm.start_time == start); + CHECK(pm.current_time == start); + + CHECK(pm.train_all == 0); + if (pm.train_correct.has_value()) { + CHECK(pm.train_correct.value() == 0); + } + + CHECK(!pm.cce_loss.has_value()); + + if (pm.sparse_cce_loss.has_value()) { + CHECK(pm.sparse_cce_loss.value() == doctest::Approx(0.0f)); + } + if (pm.mse_loss.has_value()) { + CHECK(pm.mse_loss.value() == doctest::Approx(0.0f)); + } + if (pm.rmse_loss.has_value()) { + CHECK(pm.rmse_loss.value() == doctest::Approx(0.0f)); + } + if (pm.mae_loss.has_value()) { + CHECK(pm.mae_loss.value() == doctest::Approx(0.0f)); + } + } + + SUBCASE("Test full constructor and throughput/accuracy") { + int train_all = 200; + int train_correct = 150; + float cce = 1.2f; + float sparse_cce = 1.0f; + float mse = 0.5f; + float rmse = 0.7f; + float mae = 0.3f; + double start = 100.0; + double curr = 110.0; + PerfMetrics pm(train_all, + train_correct, + cce, + sparse_cce, + mse, + rmse, + mae, + start, + curr); + + CHECK(pm.train_all == train_all); + CHECK(pm.train_correct.has_value()); + CHECK(pm.train_correct.value() == train_correct); + CHECK(pm.cce_loss.has_value()); + CHECK(pm.cce_loss.value() == doctest::Approx(cce)); + CHECK(pm.sparse_cce_loss.has_value()); + CHECK(pm.sparse_cce_loss.value() == doctest::Approx(sparse_cce)); + CHECK(pm.mse_loss.has_value()); + CHECK(pm.mse_loss.value() == doctest::Approx(mse)); + CHECK(pm.rmse_loss.has_value()); + CHECK(pm.rmse_loss.value() == doctest::Approx(rmse)); + CHECK(pm.mae_loss.has_value()); + CHECK(pm.mae_loss.value() == doctest::Approx(mae)); + CHECK(pm.start_time == start); + CHECK(pm.current_time == curr); + + float expected_throughput = train_all / (curr - start); + CHECK(get_throughput(pm) == doctest::Approx(expected_throughput)); + + float 
expected_accuracy = static_cast(train_correct) / train_all; + CHECK(get_accuracy(pm) == doctest::Approx(expected_accuracy)); + } + + SUBCASE("Test update function") { + PerfMetrics pm1(100, 50, 1.0f, 0.5f, 0.3f, 0.2f, 0.1f, 0.0, 1.0); + PerfMetrics pm2(50, 30, 0.5f, 0.3f, 0.2f, 0.1f, 0.05f, 0.0, 1.5); + + PerfMetrics updated = update(pm1, pm2); + + CHECK(updated.train_all == (100 + 50)); + if (updated.train_correct.has_value()) { + CHECK(updated.train_correct.value() == (50 + 30)); + } + + CHECK(updated.cce_loss.has_value()); + CHECK(updated.cce_loss.value() == doctest::Approx(1.0f + 0.5f)); + CHECK(updated.sparse_cce_loss.has_value()); + CHECK(updated.sparse_cce_loss.value() == doctest::Approx(0.5f + 0.3f)); + CHECK(updated.mse_loss.has_value()); + CHECK(updated.mse_loss.value() == doctest::Approx(0.3f + 0.2f)); + CHECK(updated.rmse_loss.has_value()); + CHECK(updated.rmse_loss.value() == doctest::Approx(0.2f + 0.1f)); + CHECK(updated.mae_loss.has_value()); + CHECK(updated.mae_loss.value() == doctest::Approx(0.1f + 0.05f)); + CHECK(updated.current_time == pm2.current_time); + } + + SUBCASE("Test apply_scale function") { + PerfMetrics pm(100, 50, 2.0f, 1.0f, 0.8f, 0.6f, 0.4f, 0.0, 2.0); + float scale = 0.5f; + PerfMetrics scaled = apply_scale(pm, scale); + + CHECK(scaled.cce_loss.has_value()); + CHECK(scaled.cce_loss.value() == doctest::Approx(2.0f * scale)); + CHECK(scaled.sparse_cce_loss.has_value()); + CHECK(scaled.sparse_cce_loss.value() == doctest::Approx(1.0f * scale)); + CHECK(scaled.mse_loss.has_value()); + CHECK(scaled.mse_loss.value() == doctest::Approx(0.8f * scale)); + CHECK(scaled.rmse_loss.has_value()); + CHECK(scaled.rmse_loss.value() == doctest::Approx(0.6f * scale)); + CHECK(scaled.mae_loss.has_value()); + CHECK(scaled.mae_loss.value() == doctest::Approx(0.4f * scale)); + + CHECK(scaled.train_all == pm.train_all); + if (scaled.train_correct.has_value()) { + CHECK(scaled.train_correct.value() == pm.train_correct.value()); + } + CHECK(scaled.start_time == pm.start_time); + CHECK(scaled.current_time == pm.current_time); + } + } +} diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ff74f6fb28..06db1989eb 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -49,9 +49,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {input_w, input_h, input_c, input_n}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index f91c4959cc..921a5ff08c 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {10_n, 10_n, 10_n, 10_n, 10_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { TensorShape output_shape = - 
make_tensor_shape_from_legion_dims({10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 87834d83ac..6009b3c501 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -9,9 +9,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int num_replicas = 10_n; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -51,8 +51,8 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int num_replicas = 2_n; TensorShape input_shape = - make_tensor_shape_from_legion_dims({5_n}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_legion_dims( + make_tensor_shape_from_ff_ordered({5_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_ff_ordered( {num_replicas, 5_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ @@ -82,7 +82,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -108,7 +108,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor_cpu, num_replicas.unwrap_nonnegative()); - CHECK(accessors_are_equal(input_grad_accessor_gpu, + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, input_grad_accessor_cpu)); } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index ee7530c017..fa67953947 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 481958fdfc..78ee803da6 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int reverse_dim_size = 10_n; nonnegative_int in_blk_size = 10_n; - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; @@ -65,7 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int reverse_dim_size = 4_n; nonnegative_int in_blk_size = 3_n; - TensorShape input_shape = make_tensor_shape_from_legion_dims( + TensorShape input_shape = make_tensor_shape_from_ff_ordered( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; @@ -106,7 +106,7 @@ TEST_SUITE(FF_TEST_SUITE) { reverse_dim_size.unwrap_nonnegative(), 
in_blk_size.unwrap_nonnegative()); - CHECK(accessors_are_equal(output_accessor_cpu, output_accessor_cpu)); + CHECK(accessor_data_is_equal(output_accessor_cpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -139,7 +139,7 @@ TEST_SUITE(FF_TEST_SUITE) { reverse_dim_size.unwrap_nonnegative(), in_blk_size.unwrap_nonnegative()); - CHECK(accessors_are_equal(input_grad_accessor_gpu, + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, input_grad_accessor_cpu)); } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index d4fb496f7b..ecb996227f 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index d98f88a30e..20a6898896 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -21,9 +21,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({50_n}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index cac43c6ff3..ac8876ac98 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10_n, 10_n}, DataType::FLOAT); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index bc5f48654a..e335e5b449 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,12 +1,13 @@ #include "test_utils.h" +#include "kernels/datatype_dispatch.h" #include "op-attrs/tensor_shape.h" #include "utils/join_strings.h" #include namespace FlexFlow { -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, - DataType DT) { +TensorShape make_tensor_shape_from_ff_ordered(FFOrdered dims, + DataType DT) { return TensorShape{ TensorDims{ dims, @@ -128,26 +129,6 @@ bool contains_non_zero(GenericTensorAccessorR const &accessor) { cpu_accessor.data_type, cpu_accessor); } -GenericTensorAccessorR - copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorR cpu_accessor = accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); - } - return cpu_accessor; -} - -GenericTensorAccessorW - copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, - Allocator &cpu_allocator) { - GenericTensorAccessorW cpu_accessor = 
accessor; - if (accessor.device_type == DeviceType::GPU) { - cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); - } - return cpu_accessor; -} - template struct Print2DCPUAccessorR { void operator()(GenericTensorAccessorR const &accessor, @@ -179,44 +160,6 @@ void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, accessor.data_type, cpu_accessor, stream); } -template -struct AccessorsAreEqual { - bool operator()(GenericTensorAccessorR const &accessor_a, - GenericTensorAccessorR const &accessor_b) { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR cpu_accessor_a = - copy_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); - GenericTensorAccessorR cpu_accessor_b = - copy_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); - - using T = real_type_t
<DT>;
-    T const *a_data_ptr = cpu_accessor_a.get<DT>();
-    T const *b_data_ptr = cpu_accessor_b.get<DT>
(); - - int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); - for (size_t i = 0; i < volume; i++) { - if (a_data_ptr[i] != b_data_ptr[i]) { - return false; - } - } - - return true; - } -}; - -bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, - GenericTensorAccessorR const &accessor_b) { - if (accessor_a.shape != accessor_b.shape) { - throw mk_runtime_error( - fmt::format("accessors_are_equal expected accessors to have the same " - "shape, but received: {} != {}", - accessor_a.shape, - accessor_b.shape)); - } - return DataTypeDispatch1{}( - accessor_a.data_type, accessor_a, accessor_b); -} - template struct CreateFilledAccessorW { GenericTensorAccessorW operator()(TensorShape const &shape, diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 093a9a4a97..2e7294ed1d 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_KERNELS_TEST_UTILS #include "kernels/copy_tensor_accessor.h" -#include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" @@ -29,27 +28,16 @@ GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, Allocator &allocator); -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, - DataType DT); +TensorShape make_tensor_shape_from_ff_ordered(FFOrdered dims, + DataType DT); bool contains_non_zero(GenericTensorAccessorR const &accessor); void fill_with_zeros(GenericTensorAccessorW const &accessor); -GenericTensorAccessorW - copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, - Allocator &allocator); - -GenericTensorAccessorR - copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, - Allocator &allocator); - void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, std::ostream &stream); -bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, - GenericTensorAccessorR const &accessor_b); - GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, DataTypeValue val); From 4fc04751c7b5550f19da89ac50a15ae8ad8ca1ee Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 24 Feb 2025 19:40:45 -0800 Subject: [PATCH 41/42] remove . 
files --- .envrc | 3 --- .proj.toml | 1 - .vimrc | 8 -------- 3 files changed, 12 deletions(-) delete mode 100644 .envrc delete mode 100644 .vimrc diff --git a/.envrc b/.envrc deleted file mode 100644 index 2797f0f929..0000000000 --- a/.envrc +++ /dev/null @@ -1,3 +0,0 @@ -source_up_if_exists - -use flake diff --git a/.proj.toml b/.proj.toml index b3b90bbada..10307a6efa 100644 --- a/.proj.toml +++ b/.proj.toml @@ -15,7 +15,6 @@ build_targets = [ "models", "export-model-arch", "substitution-to-dot", - "kernels-tests", ] test_targets = [ diff --git a/.vimrc b/.vimrc deleted file mode 100644 index 4c8a8a8279..0000000000 --- a/.vimrc +++ /dev/null @@ -1,8 +0,0 @@ -" example search path configuration -set path=lib/runtime/**,lib/** - -" set build target -" let g:target = "pcg" - -" set test target -" let g:test_target = "utils-test" From 8b72dcd360c5daa1391609b84eba12e3445d8383 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 24 Feb 2025 19:51:41 -0800 Subject: [PATCH 42/42] format issues --- lib/kernels/src/perf_metrics.cc | 4 ++-- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_local_cpu_allocator.cc | 2 +- lib/kernels/test/src/test_local_cuda_allocator.cc | 2 +- lib/kernels/test/src/test_perf_metrics.cc | 6 +++--- lib/kernels/test/src/test_replicate_kernel.cc | 6 +++--- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/kernels/src/perf_metrics.cc b/lib/kernels/src/perf_metrics.cc index ab0e113a26..61163caeae 100644 --- a/lib/kernels/src/perf_metrics.cc +++ b/lib/kernels/src/perf_metrics.cc @@ -15,8 +15,8 @@ PerfMetrics::PerfMetrics(int _train_all, double _start_time_micro, double _current_time_micro) : train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss), - sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss), - rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro), + sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss), + rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro), current_time(_current_time_micro) {} float get_throughput(PerfMetrics const &m) { diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index b30d1ab7f4..3a7a70c862 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -95,7 +95,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, input_grad_accessor_cpu); - CHECK(accessor_data_is_equal(input_grad_accessor_gpu, + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, input_grad_accessor_cpu)); } } diff --git a/lib/kernels/test/src/test_local_cpu_allocator.cc b/lib/kernels/test/src/test_local_cpu_allocator.cc index fa6bce36db..d5552e4cb0 100644 --- a/lib/kernels/test/src/test_local_cpu_allocator.cc +++ b/lib/kernels/test/src/test_local_cpu_allocator.cc @@ -1,5 +1,5 @@ -#include "kernels/local_cpu_allocator.h" #include "doctest/doctest.h" +#include "kernels/local_cpu_allocator.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { diff --git a/lib/kernels/test/src/test_local_cuda_allocator.cc b/lib/kernels/test/src/test_local_cuda_allocator.cc index c091576bd3..7c3e62dbeb 100644 --- a/lib/kernels/test/src/test_local_cuda_allocator.cc +++ b/lib/kernels/test/src/test_local_cuda_allocator.cc @@ -1,5 +1,5 @@ -#include "kernels/local_cuda_allocator.h" #include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" using namespace ::FlexFlow; 
TEST_SUITE(FF_TEST_SUITE) { diff --git a/lib/kernels/test/src/test_perf_metrics.cc b/lib/kernels/test/src/test_perf_metrics.cc index e958a808b7..045788bae3 100644 --- a/lib/kernels/test/src/test_perf_metrics.cc +++ b/lib/kernels/test/src/test_perf_metrics.cc @@ -1,5 +1,5 @@ -#include "kernels/perf_metrics.h" #include "doctest/doctest.h" +#include "kernels/perf_metrics.h" using namespace ::FlexFlow; @@ -16,9 +16,9 @@ TEST_SUITE(FF_TEST_SUITE) { if (pm.train_correct.has_value()) { CHECK(pm.train_correct.value() == 0); } - + CHECK(!pm.cce_loss.has_value()); - + if (pm.sparse_cce_loss.has_value()) { CHECK(pm.sparse_cce_loss.value() == doctest::Approx(0.0f)); } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 6009b3c501..b2c8ea0c19 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -52,8 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_ff_ordered({5_n}, DataType::FLOAT); - TensorShape output_shape = make_tensor_shape_from_ff_ordered( - {num_replicas, 5_n}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({num_replicas, 5_n}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{ /*workSpaceSize=*/1024 * 1024, @@ -109,7 +109,7 @@ TEST_SUITE(FF_TEST_SUITE) { num_replicas.unwrap_nonnegative()); CHECK(accessor_data_is_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 78ee803da6..01eded4297 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -140,7 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size.unwrap_nonnegative()); CHECK(accessor_data_is_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + input_grad_accessor_cpu)); } } }
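
Note on the comparison helpers this series introduces: accessor_data_is_equal
compares only the element data (each side is mirrored to the CPU before the
comparison, per the implementation in accessor.cc above), while
operator==/accessors_are_equal additionally require matching data type,
device type, and shape. That distinction is why the GPU-vs-CPU kernel tests
above were switched from accessors_are_equal to accessor_data_is_equal. The
sketch below illustrates the difference using only helpers defined in this
series; it is not part of the patch, and the test name is illustrative.

#include "doctest/doctest.h"
#include "kernels/accessor.h"
#include "test_utils.h"

using namespace ::FlexFlow;

TEST_SUITE(FF_TEST_SUITE) {
  TEST_CASE("Sketch: data equality vs. full accessor equality") {
    Allocator cpu_allocator = create_local_cpu_memory_allocator();
    Allocator gpu_allocator = create_local_cuda_memory_allocator();

    TensorShape shape =
        make_tensor_shape_from_ff_ordered({5_n, 5_n}, DataType::FLOAT);

    // Fill a tensor on the GPU, then mirror it into host memory.
    GenericTensorAccessorR gpu_accessor =
        create_random_filled_accessor_r(shape, gpu_allocator);
    GenericTensorAccessorR cpu_accessor =
        copy_accessor_r_to_cpu_if_necessary(gpu_accessor, cpu_allocator);

    // The element data matches across the device boundary...
    CHECK(accessor_data_is_equal(gpu_accessor, cpu_accessor));

    // ...but full equality fails, because device_type differs.
    CHECK(gpu_accessor != cpu_accessor);
  }
}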