diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..8bbcf3ef95 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -5,11 +5,102 @@ #include "device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" +#include "pcg/device_type.dtg.h" #include "utils/exception.h" #include "utils/required.h" namespace FlexFlow { +inline int calculate_accessor_offset(std::vector const &indices, + ArrayShape const &shape) { + int offset = 0; + int multiplier = 1; + + for (int i = 0; i < shape.num_dims(); i++) { + if (indices.at(i) >= shape.at(legion_dim_t{nonnegative_int{i}})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + shape.at(legion_dim_t{nonnegative_int{i}}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= + shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative(); + } + + return offset; +} + +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
<DT>::type const *get() const { + if (this->data_type == DT) { + return static_cast<real_type_t<DT> const *>(this->ptr); + } else { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + } + + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; + + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template <DataType DT> + real_type_t<DT>
const &at(std::vector const &indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t<DT>
; + T const *data_ptr = static_cast(this->ptr); + int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset]; + } + +public: + DataType data_type; + ArrayShape shape; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; +}; + +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); + class GenericTensorAccessorW { public: template @@ -28,64 +119,78 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; -public: - DataType data_type; - ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); + GenericTensorAccessorW() = delete; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; -class GenericTensorAccessorR { -public: template - typename data_type_enum_to_class
<DT>::type const *get() const { - if (this->data_type == DT) { - return static_cast<real_type_t<DT> const *>(this->ptr); - } else { + real_type_t<DT>
&at(std::vector const &indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t
<DT>; + T *data_ptr = static_cast<T *>(this->ptr); + int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset]; + } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + template <DataType DT> + real_type_t<DT>
&at(std::vector const &indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t<DT>
; + T const *data_ptr = static_cast(this->ptr); + int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); + void *ptr; + DeviceType device_type; -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); +private: + std::tuple + tie() const; +}; -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); @@ -137,6 +242,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -147,15 +267,17 @@ std::vector const *> return out; } +bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +285,6 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..4bf97118ce 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual 
DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,12 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..72c746b8cc 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -15,7 +15,7 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); + ArrayShape(nonnegative_int const *dims, nonnegative_int num_dims); ArrayShape(TensorShape const &shape); ArrayShape(std::vector const &); diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..1e483102dd 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..bde91bea15 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -5,9 +5,7 @@ #include "kernels/allocation.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index f2ca17f429..90202592a7 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -7,9 +7,7 @@ #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels { -namespace BatchNorm { +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -29,9 +27,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -46,8 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels -} // namespace 
FlexFlow - +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..da13e0036d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -3,27 +3,17 @@ #include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..a5df80d4da --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..50de18e823 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..430c7cf906 --- /dev/null +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..33355296dd 100644 --- 
a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..f49c8f50f4 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..97b6254750 --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..4790540098 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..1017230fb0 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..c338f465ac 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} 
// namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f5b2561b56 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -5,9 +5,7 @@ #include "kernels/accessor.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..54839bd7fa 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -4,20 +4,17 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); + void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..af2da3b11f 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,8 +15,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, @@ -30,8 +29,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..a6ae87442a 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..cd581b0a25 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h new file mode 100644 index 0000000000..cf6cfe35d1 --- /dev/null +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -0,0 +1,24 @@ +#include "kernels/allocation.h" +#include + +namespace FlexFlow { + +struct LocalCPUAllocator : public IAllocator { + LocalCPUAllocator() = default; + LocalCPUAllocator(LocalCPUAllocator const &) = delete; + LocalCPUAllocator(LocalCPUAllocator &&) = delete; + ~LocalCPUAllocator() = default; + + void *allocate(size_t) override; + void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + +private: + std::unordered_map> ptrs; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); + +Allocator create_local_cpu_memory_allocator(); + +} // namespace FlexFlow diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..7f103ea560 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -19,6 +19,9 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: + void cleanup(); + private: ffStream_t *stream; 
}; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,9 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..3b5d292a5f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -2,53 +2,91 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #include "device.h" +#include "kernels/ff_handle.h" +#include "kernels/nccl.h" +#include "kernels/per_device_op_state.dtg.h" namespace FlexFlow { -void sgd_ps_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W); + +class SGDOptimizer { +public: + static __host__ void ps_update_task_gpu(SGDOptimizer const *op, + float const *w_grad_ptr, + size_t size, + 
int num_replicas, + float *w_ptr, + float *v_ptr); + +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(SGDOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr); +#endif + +public: + float lr; + float weight_decay; + float momentum; + bool nesterov; +}; + +__global__ void + add_kernel(int count, float scale, float const *src, float *dst); + +__global__ void scale_kernel(int count, float a, float b, float *ptr); + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, float weight_decay, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float *sgd_v_ptr); - -void sgd_nccl_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *sgd_v_ptr); - -void adam_ps_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); - -void adam_nccl_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W); -} // namespace FlexFlow +class AdamOptimizer { +public: + static __host__ void ps_update_task_gpu(AdamOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr); +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(AdamOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr); #endif + +public: + float alpha; + float alpha_t; + float beta1; + float beta2; + float weight_decay; + float epsilon; +}; + +} // namespace FlexFlow + +#endif // _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..9a303952d0 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git 
a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..9650859a18 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -70,13 +69,12 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..cd3930ea1c 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..12553edd5e 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 409fc81f44..7ed55cd1a1 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -4,21 +4,17 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // 
_FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..1c7aa4ee4a --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..6e19a9d251 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..deb5b22155 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -3,9 +3,7 @@ #include "device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, float const *in_ptr, @@ -23,8 +21,6 @@ void backward_kernel(ffStream_t stream, coord_t in_blk_size, coord_t input_size); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..e482557f93 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Reverse { + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + int num_out_blks, + int reverse_dim_size, + int in_blk_size); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + int num_out_blks, + int reverse_dim_size, + int in_blk_size); + +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..520ea61b64 
100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..538b9602c2 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -3,10 +3,7 @@ #include "device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..6f33381e1a 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 0f1cc2ae61..0ed10ac03d 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -8,8 +8,7 @@ namespace FlexFlow { -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { void forward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, @@ -18,11 +17,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..43f57717f8 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,7 +1,76 @@ #include "kernels/accessor.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include +#include namespace FlexFlow { +template +struct AccessorDataIsEqual { + bool operator()(GenericTensorAccessorR const &a, + GenericTensorAccessorR const &b) { + int const num_elements = 
a.shape.num_elements().unwrap_nonnegative(); + if (num_elements != b.shape.num_elements().unwrap_nonnegative()) { + return false; + } + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + auto cpu_a = copy_accessor_r_to_cpu_if_necessary(a, cpu_allocator); + auto cpu_b = copy_accessor_r_to_cpu_if_necessary(b, cpu_allocator); + + using T = real_type_t
<DT>; + T const *a_ptr = cpu_a.get<DT>
(); + T const *b_ptr = cpu_b.get<DT>
(); + + return std::equal(a_ptr, a_ptr + num_elements, b_ptr); + } +}; + +bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + return accessor_a.data_type == accessor_b.data_type && + accessor_a.device_type == accessor_b.device_type && + accessor_a.shape == accessor_b.shape && + accessor_data_is_equal(accessor_a, accessor_b); +} + +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); +} + +GenericTensorAccessorW::GenericTensorAccessorW( + DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorW::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorW::operator==( + GenericTensorAccessorW const &other) const { + return accessors_are_equal(*this, other); +} + +bool GenericTensorAccessorW::operator!=( + GenericTensorAccessorW const &other) const { + return !(accessors_are_equal(*this, other)); +} + int32_t *GenericTensorAccessorW::get_int32_ptr() const { return this->get(); } @@ -33,6 +102,31 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { return (s << fmt::to_string(a)); } +GenericTensorAccessorR::GenericTensorAccessorR( + DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorR::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorR::operator==( + GenericTensorAccessorR const &other) const { + return accessors_are_equal(*this, other); +} + +bool GenericTensorAccessorR::operator!=( + GenericTensorAccessorR const &other) const { + return !(accessors_are_equal(*this, other)); +} + int32_t const *GenericTensorAccessorR::get_int32_ptr() const { return this->get(); } @@ -64,51 +158,6 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { return (s << fmt::to_string(a)); } -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { return get(a); } @@ -156,22 +205,17 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; + 
return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.device_type}; } -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -184,9 +228,4 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - } // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index d666592e77..bed8daba51 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -11,11 +11,18 @@ void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } +DeviceType Allocator::get_allocation_device_type() const { + return this->i_allocator->get_allocation_device_type(); +} + GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, tensor_shape, ptr}; + return {tensor_shape.data_type, + tensor_shape, + ptr, + this->get_allocation_device_type()}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 243185ada4..499aebad86 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -11,7 +11,7 @@ static LegionOrdered return LegionOrdered{reversed(vector_of(ff_ordered))}; } -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) +ArrayShape::ArrayShape(nonnegative_int const *_dims, nonnegative_int num_dims) : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} ArrayShape::ArrayShape(TensorShape const &shape) @@ -51,18 +51,65 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } +legion_dim_t ArrayShape::last_idx() const { + if (this->dims.size() == 0) { + throw mk_runtime_error("Cannot get last index of an empty shape"); + } + return legion_dim_t(nonnegative_int{this->dims.size() - 1}); +} + +legion_dim_t ArrayShape::neg_idx(int idx) const { + if (std::abs(idx) > this->dims.size()) { + throw mk_runtime_error( + fmt::format("Invalid negative index: {} (shape has {} dimensions)", + idx, + this->dims.size())); + } + + if (idx >= 0) { + throw mk_runtime_error(fmt::format( + "Idx should be negative for negative indexing, got {}", idx)); + } + + return legion_dim_t(nonnegative_int{this->dims.size() + idx}); +} + bool ArrayShape::operator==(ArrayShape const &other) const { - return this->tie() == other.tie(); + return this->dims == other.dims; } bool ArrayShape::operator!=(ArrayShape const &other) const { - return this->tie() != other.tie(); + return !(this->dims == other.dims); } ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { - NOT_IMPLEMENTED(); + nonnegative_int num_dims = this->num_dims(); + + auto to_legion_index = 
[num_dims](auto arg) -> nonnegative_int { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return legion_dim_from_ff_dim(arg, num_dims).value; + } else { + return arg.value; + } + }; + + nonnegative_int start_idx = + (start.has_value()) ? std::visit(to_legion_index, start.value()) : 0_n; + + nonnegative_int end_idx = + (end.has_value()) ? std::visit(to_legion_index, end.value()) : num_dims; + + if (start_idx > num_dims || end_idx > num_dims || start_idx > end_idx) { + throw mk_runtime_error(fmt::format( + "Invalid sub_shape range: start={}, end={}", start_idx, end_idx)); + } + + return ArrayShape(std::vector( + this->dims.begin() + start_idx.unwrap_nonnegative(), + this->dims.begin() + end_idx.unwrap_nonnegative())); } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -74,7 +121,11 @@ std::optional ArrayShape::at_maybe(legion_dim_t index) const { } std::optional ArrayShape::at_maybe(ff_dim_t index) const { - return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); + if (index.value < this->num_dims()) { + return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); + } else { + return std::nullopt; + } } std::tuple const &> ArrayShape::tie() const { diff --git a/lib/kernels/src/copy_tensor_accessor.cc b/lib/kernels/src/copy_tensor_accessor.cc new file mode 100644 index 0000000000..cc033223f8 --- /dev/null +++ b/lib/kernels/src/copy_tensor_accessor.cc @@ -0,0 +1,107 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW 
dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) { + throw mk_runtime_error("Allocator must be a CPU allocator"); + } + + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} + +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) { + throw mk_runtime_error("Allocator must be a CPU allocator"); + } + + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); + } + return cpu_accessor; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc new file mode 100644 index 0000000000..cdd57b8947 --- /dev/null +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume().unwrap_nonnegative(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc new file mode 100644 index 0000000000..577984f21a --- /dev/null +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
<DT>(),
+           input.get<DT>(),
+           input.shape.get_volume().unwrap_nonnegative() *
+               size_of_datatype(DT).unwrap_nonnegative());
+  }
+};
+
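+// Backward for Combine on the CPU: accumulate the incoming output gradient
+// into the input gradient element by element.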
+template <DataType DT>
+struct CPUBackwardKernel {
+  void operator()(GenericTensorAccessorR const &output_grad,
+                  GenericTensorAccessorW const &input_grad) {
+    size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative();
+    for (int i = 0; i < num_elements; ++i) {
+      input_grad.get<DT>()[i] += output_grad.get<DT>()[i];
+    }
+  }
+};
+
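+// Entry points dispatch on the tensor's runtime DataType to the templated
+// kernels above.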
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output);
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorW const &input_grad) {
+  DataTypeDispatch1<CPUBackwardKernel>{}(
+      input_grad.data_type, output_grad, input_grad);
+}
+
+} // namespace FlexFlow::Kernels::Combine
diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc
new file mode 100644
index 0000000000..cdb030d2ff
--- /dev/null
+++ b/lib/kernels/src/cpu/replicate_kernels.cc
@@ -0,0 +1,47 @@
+#include "kernels/datatype_dispatch.h"
+#include "kernels/replicate_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Replicate {
+
+template <DataType DT>
+struct CPUForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output) {
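+    // Forward is a plain byte-for-byte copy of the input into the output.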
+    memcpy(output.get<DT>(),
+           input.get<DT>(),
+           input.shape.num_elements().unwrap_nonnegative() *
+               size_of_datatype(DT).unwrap_nonnegative());
+  }
+};
+
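+// Backward for Replicate on the CPU: reduce the per-replica output gradients
+// back into the single input gradient by summation.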
+template <DataType DT>
+struct CPUBackwardKernel {
+  void operator()(GenericTensorAccessorR const &output,
+                  GenericTensorAccessorW &input,
+                  size_t num_elements,
+                  size_t num_replicas) {
+    using T = real_type_t<DT>;
+    for (int i = 0; i < num_elements; i++) {
+      T cur_sum = 0;
+      for (int j = 0; j < num_replicas; j++) {
+        cur_sum += output.at<DT>({i, j});
+      }
+      input.at<DT>
({i}) = cur_sum; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas) { + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + DataTypeDispatch1{}( + input.data_type, output, input, num_elements, num_replicas); +} + +} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc new file mode 100644 index 0000000000..e259d059ff --- /dev/null +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow::Kernels::Reverse { + +template +struct CPUReverseForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + int num_out_blks, + int reverse_dim_size, + int in_blk_size) { + for (int blk_idx = 0; blk_idx < num_out_blks; blk_idx++) { + for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { + for (int inner_idx = 0; inner_idx < in_blk_size; inner_idx++) { + output.at
<DT>({inner_idx, rev_idx, blk_idx}) = input.at<DT>
( + {inner_idx, reverse_dim_size - 1 - rev_idx, blk_idx}); + } + } + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + int num_out_blks, + int reverse_dim_size, + int in_blk_size) { + DataTypeDispatch1{}(input_accessor.data_type, + input_accessor, + output_accessor, + num_out_blks, + reverse_dim_size, + in_blk_size); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + int num_out_blks, + int reverse_dim_size, + int in_blk_size) { + DataTypeDispatch1{}(output_accessor.data_type, + output_accessor, + input_accessor, + num_out_blks, + reverse_dim_size, + in_blk_size); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 66388c0ec8..4669955019 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template __global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..7ea1d3b9d3 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -17,12 +17,11 @@ #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template 
-__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, 
batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,229 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { 
+ void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -388,39 +557,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const 
&weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -448,27 +807,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..112f84c90c 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. 
*/ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + 
CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 4e153a028e..98c528cd7b 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index fe7aec68b9..230ca70627 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 2715ff16e9..683dbbaac5 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -17,46 +17,54 @@ #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + legion_dim_t legion_axis = (legion_dim_from_ff_dim(axis, shape.num_dims())); + assert(legion_axis.value < shape.num_dims()); + if (legion_axis.value 
== 0_n) { + legion_axis.value = 1_n; + } + blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() .unwrap_nonnegative(); - num_blocks = - shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); + num_blocks = shape.sub_shape(legion_axis, std::nullopt) + .num_elements() + .unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = inputs.size(); - assert(num_inputs <= MAX_NUM_INPUTS); + assert(inputs.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); - for (int i = 0; i < num_inputs; i++) { - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - copy_with_stride<<>>(output.get_float_ptr() + offset, - inputs[i].get_float_ptr(), - num_blocks, + input.get_float_ptr(), + blocks_to_copy, output_blk_size, - input_blk_sizes[i]); - offset += input_blk_sizes[i]; + input_blk_size); + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } @@ -64,32 +72,32 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? 
input_blk_size * input_num_blocks + : input_blk_size; } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index dac55539d2..32e749e15a 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 056c80ecf6..e096803682 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 973d05f596..14bb3bddd1 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -35,8 +35,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..99034089b5 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -18,9 +18,7 @@ #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,14 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + 
stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -158,8 +159,12 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +185,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..0d5a772918 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only 
support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float *)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 2831562f58..67d5c25c3b 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -40,8 +40,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel> <<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..f8b35ec885 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 5d95a3766a..0ef7e304cf 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -55,8 +55,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -75,9 +75,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + 
GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4706f38fd4..78022e869b 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -51,15 +50,15 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } @@ -72,11 +71,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index c5a289ce6b..374dfb22ba 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -43,8 +43,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..367e337b18 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -17,7 +17,6 @@ #include "kernels/reverse_kernels.h" namespace FlexFlow { - namespace Kernels { namespace Reverse { @@ -27,13 +26,14 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); i = i - blk_idx * (reverse_dim_size * in_blk_size); coord_t reverse_dim_idx = i / in_blk_size; i = i - reverse_dim_idx * in_blk_size; coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] 
= in_ptr[in_idx]; + out_ptr[out_idx] = in_ptr[in_idx]; } } diff --git a/lib/kernels/src/cuda/ops/softmax_kernels.cu b/lib/kernels/src/cuda/ops/softmax_kernels.cu index 93ed85de18..d2498d08a4 100644 --- a/lib/kernels/src/cuda/ops/softmax_kernels.cu +++ b/lib/kernels/src/cuda/ops/softmax_kernels.cu @@ -61,8 +61,8 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 60d2f7f342..e1aaacc7f9 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -100,8 +100,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposeAttrs const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad) { + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { TransposeStrides info; info.num_dim = in_grad.shape.num_dims().unwrap_nonnegative(); diff --git a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 83% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..8aab79ba65 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,7 +13,9 @@ * limitations under the License. */ +#include "device.h" #include "kernels/optimizer_kernels.h" +#include "utils/exception.h" namespace FlexFlow { @@ -80,13 +82,24 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); + // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -157,7 +170,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, for (int i = 1; i < num_replicas; i++) { float const *src = w_grad_ptr + i * size; add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); + (float *)w_grad_ptr, src, size); } // checkCUDA(cudaDeviceSynchronize()); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", @@ -188,13 +201,23 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw 
mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp index 7ca3149f2f..06b42d420a 100644 --- a/lib/kernels/src/hip/embedding_kernels.cpp +++ b/lib/kernels/src/hip/embedding_kernels.cpp @@ -364,8 +364,8 @@ struct ForwardKernel { weight.data_type == DataType::FLOAT || weight.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -374,10 +374,11 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -386,9 +387,7 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } @@ -408,8 +407,9 @@ struct BackwardKernel { assert(output.data_type == DataType::HALF || output.data_type == DataType::FLOAT || output.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), + + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -418,9 +418,11 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -429,9 +431,7 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index bbb15c5636..14016a6202 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -9,8 +9,7 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { - return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - - ff_dim.value.unwrap_nonnegative() - 1}}; + return legion_dim_t{num_dimensions - ff_dim.value - 1_n}; } } // namespace FlexFlow diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc new file mode 100644 index 0000000000..adc31b2c6b --- /dev/null +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -0,0 +1,29 @@ +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" +#include "utils/containers/contains_key.h" + +namespace FlexFlow { +void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); + return ptr; +} + +void 
LocalCPUAllocator::deallocate(void *ptr) { + if (contains_key(this->ptrs, ptr)) { + this->ptrs.erase(ptr); + } else { + throw std::runtime_error( + "Deallocating a pointer that was not allocated by this Allocator"); + } +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; +} + +Allocator create_local_cpu_memory_allocator() { + return Allocator::create(); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index cdcfb017a0..416768a479 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -20,6 +20,10 @@ void LocalCudaAllocator::deallocate(void *ptr) { } } +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; +} + LocalCudaAllocator::~LocalCudaAllocator() { for (void *ptr : this->ptrs) { checkCUDA(cudaFree(ptr)); @@ -27,7 +31,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..9f1737240e 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -3,14 +3,15 @@ namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ 
ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + +void ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/src/perf_metrics.cc b/lib/kernels/src/perf_metrics.cc index 2036ddd35a..61163caeae 100644 --- a/lib/kernels/src/perf_metrics.cc +++ b/lib/kernels/src/perf_metrics.cc @@ -15,8 +15,9 @@ PerfMetrics::PerfMetrics(int _train_all, double _start_time_micro, double _current_time_micro) : train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss), - mse_loss(_mse_loss), rmse_loss(_rmse_loss), mae_loss(_mae_loss), - start_time(_start_time_micro), current_time(_current_time_micro) {} + sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss), + rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro), + current_time(_current_time_micro) {} float get_throughput(PerfMetrics const &m) { return m.train_all / (m.current_time - m.start_time); diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 00da2d0d70..066cb96753 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) set(FF_TEST_EXEC_NAME "kernels-tests") diff --git a/lib/kernels/test/src/test_accessor.cc b/lib/kernels/test/src/test_accessor.cc new file mode 100644 index 0000000000..e9611a928c --- /dev/null +++ b/lib/kernels/test/src/test_accessor.cc @@ -0,0 +1,136 @@ +#include "doctest/doctest.h" +#include "kernels/accessor.h" +#include "op-attrs/datatype_value.h" +#include "test_utils.h" + +using namespace ::FlexFlow; + +template +void check_accessor_get(GenericTensorAccessorR const &accessor, + real_type_t
<DT> expected) {
+  CHECK(*accessor.get<DT>() == expected);
+
+  if constexpr (DT == DataType::INT32) {
+    CHECK(*accessor.get_int32_ptr() == expected);
+  } else if constexpr (DT == DataType::INT64) {
+    CHECK(*accessor.get_int64_ptr() == expected);
+  } else if constexpr (DT == DataType::FLOAT) {
+    CHECK(*accessor.get_float_ptr() == doctest::Approx(expected));
+  } else if constexpr (DT == DataType::DOUBLE) {
+    CHECK(*accessor.get_double_ptr() == doctest::Approx(expected));
+  } else if constexpr (DT == DataType::HALF) {
+    CHECK(*accessor.get_half_ptr() == doctest::Approx(expected));
+  }
+}
+
+template <DataType DT>
+void run_accessor_w_test(DataTypeValue value,
+                         real_type_t<DT> expected,
+                         Allocator allocator) {
+  TensorShape shape = make_tensor_shape_from_ff_ordered({1_n}, DT);
+  GenericTensorAccessorW accessor =
+      create_filled_accessor_w(shape, allocator, value);
+  check_accessor_get<DT>(read_only_accessor_from_write_accessor(accessor),
+                         expected);
+}
+
+template <DataType DT>
+void run_accessor_r_test(DataTypeValue value,
+                         real_type_t<DT> expected,
+                         Allocator allocator) {
+  TensorShape shape = make_tensor_shape_from_ff_ordered({1_n}, DT);
+  GenericTensorAccessorR accessor =
+      create_filled_accessor_r(shape, allocator, value);
+  check_accessor_get<DT>
(accessor, expected); +} + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test GenericTensorAccessors") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("Test GenericTensorAccessorW") { + SUBCASE("Test get methods for GenericTensorAccessorW") { + run_accessor_w_test( + make_int32_data_type_value(12345), 12345, cpu_allocator); + run_accessor_w_test( + make_int64_data_type_value(12345LL), 12345LL, cpu_allocator); + run_accessor_w_test( + make_float_data_type_value(1.23f), 1.23f, cpu_allocator); + run_accessor_w_test( + make_double_data_type_value(1.23), 1.23, cpu_allocator); + } + + SUBCASE("Test operator== and operator!= for GenericTensorAccessorW") { + TensorShape shape = + make_tensor_shape_from_ff_ordered({1_n}, DataType::INT32); + + GenericTensorAccessorW accessor1 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorW accessor2 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorW accessor3 = create_filled_accessor_w( + shape, cpu_allocator, make_int32_data_type_value(54321)); + + CHECK(accessor1 == accessor2); + CHECK(accessor1 != accessor3); + } + + SUBCASE("Test at() method for GenericTensorAccessorW") { + DataType const DT = DataType::INT32; + TensorShape shape = make_tensor_shape_from_ff_ordered({3_n, 3_n}, DT); + + GenericTensorAccessorW accessor_1 = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW accessor_2 = + copy_tensor_accessor_w(accessor_1, cpu_allocator); + + CHECK(accessor_1.at
<DT>({0, 0}) == accessor_2.at<DT>({0, 0}));
+        CHECK(accessor_1.at<DT>({1, 0}) == accessor_2.at<DT>({1, 0}));
+        CHECK(accessor_1.at<DT>({2, 2}) == accessor_2.at<DT>
({2, 2})); + } + } + + SUBCASE("Test GenericTensorAccessorR") { + + SUBCASE("Test get methods for GenericTensorAccessorR") { + run_accessor_r_test( + make_int32_data_type_value(12345), 12345, cpu_allocator); + run_accessor_r_test( + make_int64_data_type_value(12345LL), 12345LL, cpu_allocator); + run_accessor_r_test( + make_float_data_type_value(1.23f), 1.23f, cpu_allocator); + run_accessor_r_test( + make_double_data_type_value(1.23), 1.23, cpu_allocator); + } + + SUBCASE("Test operator== and operator!= for GenericTensorAccessorR") { + TensorShape shape = + make_tensor_shape_from_ff_ordered({1_n}, DataType::INT32); + + GenericTensorAccessorR accessor1 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorR accessor2 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(12345)); + GenericTensorAccessorR accessor3 = create_filled_accessor_r( + shape, cpu_allocator, make_int32_data_type_value(54321)); + + CHECK(accessor1 == accessor2); + CHECK(accessor1 != accessor3); + } + + SUBCASE("Test at() method for GenericTensorAccessorR") { + DataType const DT = DataType::INT32; + TensorShape shape = make_tensor_shape_from_ff_ordered({3_n, 3_n}, DT); + + GenericTensorAccessorR accessor_1 = + create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR accessor_2 = + copy_tensor_accessor_r(accessor_1, cpu_allocator); + + CHECK(accessor_1.at
<DT>({0, 0}) == accessor_2.at<DT>({0, 0}));
+        CHECK(accessor_1.at<DT>({1, 0}) == accessor_2.at<DT>({1, 0}));
+        CHECK(accessor_1.at<DT>({2, 2}) == accessor_2.at<DT>
({2, 2})); + } + } + } +} diff --git a/lib/kernels/test/src/test_array_shape.cc b/lib/kernels/test/src/test_array_shape.cc new file mode 100644 index 0000000000..7ede1791ef --- /dev/null +++ b/lib/kernels/test/src/test_array_shape.cc @@ -0,0 +1,105 @@ +#include "doctest/doctest.h" +#include "kernels/array_shape.h" +#include "test_utils.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test ArrayShape") { + ArrayShape shape({1_n, 2_n, 3_n, 4_n}); + + SUBCASE("Test get_volume() and num_elements()") { + CHECK(shape.get_volume() == 1 * 2 * 3 * 4); + CHECK(shape.num_elements() == 1 * 2 * 3 * 4); + } + + SUBCASE("Test num_dims() and get_dim()") { + CHECK(shape.num_dims() == 4); + CHECK(shape.get_dim() == 4); + } + + SUBCASE("Test operator[] and at()") { + CHECK(shape[legion_dim_t{0_n}] == 1); + CHECK(shape[legion_dim_t{1_n}] == 2); + CHECK(shape[legion_dim_t{2_n}] == 3); + CHECK(shape[legion_dim_t{3_n}] == 4); + + CHECK(shape.at(legion_dim_t{0_n}) == 1); + CHECK(shape.at(legion_dim_t{1_n}) == 2); + CHECK(shape.at(legion_dim_t{2_n}) == 3); + CHECK(shape.at(legion_dim_t{3_n}) == 4); + + CHECK(shape.at(ff_dim_t{0_n}) == 4); + CHECK(shape.at(ff_dim_t{1_n}) == 3); + CHECK(shape.at(ff_dim_t{2_n}) == 2); + CHECK(shape.at(ff_dim_t{3_n}) == 1); + } + + SUBCASE("Test operator== and operator!=") { + ArrayShape shape2({1_n, 2_n, 3_n, 4_n}); + ArrayShape shape3({1_n, 2_n, 3_n, 5_n}); + + CHECK(shape == shape2); + CHECK(shape != shape3); + } + + SUBCASE("Test last_idx()") { + CHECK(shape.last_idx() == legion_dim_t{3_n}); + + ArrayShape empty_shape(std::vector{}); + CHECK_THROWS(empty_shape.last_idx()); + } + + SUBCASE("Test neg_idx()") { + CHECK(shape.neg_idx(-1) == legion_dim_t{3_n}); + CHECK(shape.neg_idx(-2) == legion_dim_t{2_n}); + CHECK(shape.neg_idx(-3) == legion_dim_t{1_n}); + CHECK(shape.neg_idx(-4) == legion_dim_t{0_n}); + + CHECK_THROWS(shape.neg_idx(-5)); + } + + SUBCASE("Test at_maybe()") { + CHECK(shape.at_maybe(legion_dim_t{0_n}).value() == 1); + CHECK(shape.at_maybe(legion_dim_t{1_n}).value() == 2); + CHECK(shape.at_maybe(legion_dim_t{2_n}).value() == 3); + CHECK(shape.at_maybe(legion_dim_t{3_n}).value() == 4); + CHECK(!shape.at_maybe(legion_dim_t{4_n}).has_value()); + + CHECK(shape.at_maybe(ff_dim_t{0_n}).value() == 4); + CHECK(shape.at_maybe(ff_dim_t{1_n}).value() == 3); + CHECK(shape.at_maybe(ff_dim_t{2_n}).value() == 2); + CHECK(shape.at_maybe(ff_dim_t{3_n}).value() == 1); + CHECK(!shape.at_maybe(ff_dim_t{4_n}).has_value()); + } + + SUBCASE("Test subshape()") { + SUBCASE("Test basic subshape") { + ArrayShape ref_shape({2_n, 3_n}); + ArrayShape subshape = + shape.sub_shape(legion_dim_t{1_n}, legion_dim_t{3_n}); + + CHECK(ref_shape == subshape); + } + + SUBCASE("Test empty subshape") { + ArrayShape ref_shape(std::vector{}); + ArrayShape subshape = + shape.sub_shape(legion_dim_t{0_n}, legion_dim_t{0_n}); + CHECK(ref_shape == subshape); + } + + SUBCASE("Test subshape with no start") { + ArrayShape ref_shape({1_n, 2_n, 3_n}); + ArrayShape subshape = shape.sub_shape(std::nullopt, legion_dim_t{3_n}); + CHECK(ref_shape == subshape); + } + + SUBCASE("Test subshape with no end") { + ArrayShape ref_shape({2_n, 3_n, 4_n}); + ArrayShape subshape = shape.sub_shape(legion_dim_t{1_n}, std::nullopt); + CHECK(ref_shape == subshape); + } + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..6b54554a9b 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ 
b/lib/kernels/test/src/test_attention_kernel.cc @@ -19,7 +19,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -39,16 +41,16 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); - TensorShape weight_shape = make_float_tensor_shape_from_legion_dims( - {nonnegative_int{state.weightSize}}); + TensorShape query_shape = make_tensor_shape_from_ff_ordered( + {qoSeqLength, num_samples, qSize}, DataType::FLOAT); + TensorShape key_shape = make_tensor_shape_from_ff_ordered( + {kvSeqLength, num_samples, kSize}, DataType::FLOAT); + TensorShape value_shape = make_tensor_shape_from_ff_ordered( + {kvSeqLength, num_samples, vSize}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_ff_ordered( + {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); + TensorShape weight_shape = make_tensor_shape_from_ff_ordered( + {nonnegative_int{state.weightSize}}, DataType::FLOAT); GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -72,9 +74,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..ba9b3ac0e2 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,16 +15,18 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); + make_tensor_shape_from_ff_ordered({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); + make_tensor_shape_from_ff_ordered({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + make_tensor_shape_from_ff_ordered({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..698a320a69 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include 
"kernels/batch_norm_kernels.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -26,25 +29,25 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = make_tensor_shape_from_ff_ordered( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_ff_ordered( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape scale_shape = make_tensor_shape_from_ff_ordered( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape bias_shape = make_tensor_shape_from_ff_ordered( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel( /*stream=*/managed_stream.raw_stream(), @@ -54,10 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*scale_ptr=*/scale_accessor.get_float_ptr(), /*bias_ptr=*/bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -73,9 +73,9 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, - /*input_ptr=*/input_accessor.get_float_ptr(), - /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + /*input_ptr=*/input_accessor.get_float_ptr(), /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), /*scale_ptr=*/scale_accessor.get_float_ptr(), /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), @@ -83,19 +83,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*numElements=*/ input_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - 
load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0e0769014d..d314a6bcc2 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_cpu.h" #include "test_utils.h" -#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -11,46 +11,68 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100_n, 100_n}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({10_n, 2_n}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({10_n, 2_n}, 
DataType::DOUBLE); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..3a7a70c862 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,37 @@ #include "doctest/doctest.h" #include "kernels/combine_kernels.h" +#include "kernels/combine_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); + make_tensor_shape_from_ff_ordered({100_n, 100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +39,64 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({5_n, 5_n}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") 
{ + // Run GPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..f8bc31c3d5 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -6,51 +6,96 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 3_n; - nonnegative_int size_per_input = 100_n; - ff_dim_t concat_axis = ff_dim_t{0_n}; - - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = make_tensor_shape_from_ff_ordered( + {input_rows, input_cols}, DataType::FLOAT); + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return 
create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = make_tensor_shape_from_ff_ordered( + {num_inputs * input_rows, input_cols}, DataType::FLOAT); + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = make_tensor_shape_from_ff_ordered( + {input_rows, num_inputs * input_cols}, DataType::FLOAT); + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = make_tensor_shape_from_ff_ordered( + {input_rows, input_cols}, DataType::FLOAT); + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = make_tensor_shape_from_ff_ordered( + {num_inputs * input_rows, input_cols}, DataType::FLOAT); + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = make_tensor_shape_from_ff_ordered( + {input_rows, num_inputs * input_cols}, DataType::FLOAT); + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_copy_tensor_accessor.cc b/lib/kernels/test/src/test_copy_tensor_accessor.cc new file mode 100644 index 0000000000..a6a4cfde53 --- /dev/null +++ b/lib/kernels/test/src/test_copy_tensor_accessor.cc @@ -0,0 +1,76 @@ +#include "doctest/doctest.h" +#include "kernels/accessor.h" +#include "op-attrs/datatype_value.h" +#include "test_utils.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test copy_tensor_accessor") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + TensorShape shape = + make_tensor_shape_from_ff_ordered({5_n, 5_n}, DataType::FLOAT); + + SUBCASE("Test copy_tensor_accessor_r") { + GenericTensorAccessorR src_accessor = + 
create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_tensor_accessor_r(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + } + + SUBCASE("Test copy_tensor_accessor_w") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_tensor_accessor_w(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + } + + SUBCASE("Test copy_accessor_r_to_cpu_if_necessary") { + SUBCASE("Test necessary") { + GenericTensorAccessorR src_accessor = + create_random_filled_accessor_r(shape, gpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_accessor_r_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + + SUBCASE("Test not necessary") { + GenericTensorAccessorR src_accessor = + create_random_filled_accessor_r(shape, cpu_allocator); + GenericTensorAccessorR dst_accessor = + copy_accessor_r_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + } + + SUBCASE("Test copy_accessor_w_to_cpu_if_necessary") { + SUBCASE("Test necessary") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, gpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_accessor_w_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + + SUBCASE("Test not necessary") { + GenericTensorAccessorW src_accessor = + create_random_filled_accessor_w(shape, cpu_allocator); + GenericTensorAccessorW dst_accessor = + copy_accessor_w_to_cpu_if_necessary(src_accessor, cpu_allocator); + + CHECK(accessor_data_is_equal(src_accessor, dst_accessor)); + CHECK(dst_accessor.device_type == DeviceType::CPU); + } + } + } +} diff --git a/lib/kernels/test/src/test_datatype_dispatch.cc b/lib/kernels/test/src/test_datatype_dispatch.cc new file mode 100644 index 0000000000..41737d715a --- /dev/null +++ b/lib/kernels/test/src/test_datatype_dispatch.cc @@ -0,0 +1,65 @@ +#include "doctest/doctest.h" +#include "kernels/datatype_dispatch.h" + +using namespace ::FlexFlow; + +template +struct TestDatatypeDispatch1 { + int operator()(int value) { + if (DT == DataType::FLOAT) { + return value + 1; + } else if (DT == DataType::INT32) { + return value + 2; + } else { + return value + 3; + } + } +}; + +template +struct TestDatatypeDispatch2 { + void operator()(int &value) { + if (IDT == DataType::INT32 && ODT == DataType::FLOAT) { + value *= 2; + } else if (IDT == DataType::FLOAT && ODT == DataType::INT32) { + value *= 3; + } else { + value *= 4; + } + } +}; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test DataTypeDispatch") { + SUBCASE("Test DataTypeDispatch1") { + CHECK(DataTypeDispatch1{}(DataType::FLOAT, 1) == + 2); + CHECK(DataTypeDispatch1{}(DataType::INT32, 1) == + 3); + CHECK(DataTypeDispatch1{}(DataType::DOUBLE, 1) == + 4); + } + + SUBCASE("Test DataTypeDispatch2") { + int value = 1; + + SUBCASE("Case One") { + DataTypeDispatch2{}( + DataType::INT32, DataType::FLOAT, value); + CHECK(value == 2); + } + + SUBCASE("Case Two") { + DataTypeDispatch2{}( + DataType::FLOAT, DataType::INT32, value); + CHECK(value == 3); + } + + SUBCASE("Test Three") { + DataTypeDispatch2{}( + 
DataType::DOUBLE, DataType::DOUBLE, value); + CHECK(value == 4); + } + } + } +} diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..e5eba341f3 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -14,25 +14,22 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +38,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..ee4554d00a 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/flat_kernels.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,15 +8,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +29,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 
0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..64cc824b9b 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -3,59 +3,87 @@ #include "test_utils.h" using namespace ::FlexFlow; + TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::FLOAT); + run_forward_test(input_shape, 
index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_ff_ordered({10_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::FLOAT); + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..4d5802936e 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/layer_norm_kernels.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -11,13 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = make_tensor_shape_from_ff_ordered( + {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + make_tensor_shape_from_ff_ordered({feature_size}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +34,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + 
create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_legion_dim.cc b/lib/kernels/test/src/test_legion_dim.cc new file mode 100644 index 0000000000..c06b779ad8 --- /dev/null +++ b/lib/kernels/test/src/test_legion_dim.cc @@ -0,0 +1,29 @@ +#include "doctest/doctest.h" +#include "kernels/legion_dim.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LegionDim") { + SUBCASE("Test add_to_legion_dim") { + legion_dim_t dim{1_n}; + CHECK(add_to_legion_dim(dim, 2) == legion_dim_t{3_n}); + } + + SUBCASE("Test legion_dim_from_ff_dim") { + CHECK(legion_dim_from_ff_dim(ff_dim_t{0_n}, 4_n) == legion_dim_t{3_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{1_n}, 4_n) == legion_dim_t{2_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{2_n}, 4_n) == legion_dim_t{1_n}); + CHECK(legion_dim_from_ff_dim(ff_dim_t{3_n}, 4_n) == legion_dim_t{0_n}); + } + + SUBCASE("Test LegionOrdered") { + LegionOrdered legion_ordered{1, 2, 3, 4}; + + SUBCASE("Test ff_ordered_from_legion_ordered") { + CHECK(ff_ordered_from_legion_ordered(legion_ordered) == + FFOrdered{4, 3, 2, 1}); + } + } + } +} diff --git a/lib/kernels/test/src/test_local_cpu_allocator.cc b/lib/kernels/test/src/test_local_cpu_allocator.cc new file mode 100644 index 0000000000..d5552e4cb0 --- /dev/null +++ b/lib/kernels/test/src/test_local_cpu_allocator.cc @@ -0,0 +1,19 @@ +#include "doctest/doctest.h" +#include "kernels/local_cpu_allocator.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LocalCPUAllocator") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("Test allocate and deallocate") { + void *ptr = cpu_allocator.allocate(100); + CHECK(ptr != nullptr); + cpu_allocator.deallocate(ptr); + } + + SUBCASE("Test get_allocation_device_type") { + CHECK(cpu_allocator.get_allocation_device_type() == DeviceType::CPU); + } + } +} diff --git a/lib/kernels/test/src/test_local_cuda_allocator.cc b/lib/kernels/test/src/test_local_cuda_allocator.cc new file mode 100644 index 0000000000..7c3e62dbeb --- /dev/null +++ b/lib/kernels/test/src/test_local_cuda_allocator.cc @@ -0,0 +1,19 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" + +using namespace ::FlexFlow; +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test LocalCUDAAllocator") { + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + + SUBCASE("Test allocate and deallocate") { + void *ptr = gpu_allocator.allocate(100); + CHECK(ptr != nullptr); + gpu_allocator.deallocate(ptr); + } + + SUBCASE("Test 
get_allocation_device_type") { + CHECK(gpu_allocator.get_allocation_device_type() == DeviceType::GPU); + } + } +} diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..841c9a82ab --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,89 @@ +#include "doctest/doctest.h" +#include "kernels/gather_kernels.h" +#include "test_utils.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({2_n, 20_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_ff_ordered({10_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); + run_forward_test(input_shape, index_shape, output_shape); + } + } + + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({2_n, 100_n}, DataType::FLOAT); + TensorShape index_shape = + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::INT32); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({2_n, 25_n}, DataType::FLOAT); + run_backward_test(input_shape, index_shape, output_shape); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..5902664a14 --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,37 
@@ +#include "doctest/doctest.h" +#include "kernels/managed_per_device_ff_handle.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + new_handle = std::move(base_handle); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..e9fab697bb 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ #include "doctest/doctest.h" #include "kernels/partition_kernels.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -15,47 +18,33 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); 
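+      // Fill output_grad with 1s and input_grad with 2s so the backward pass
+      // has non-zero data to propagate; the check below only asserts that
+      // input_grad ends up non-zero.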
Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_perf_metrics.cc b/lib/kernels/test/src/test_perf_metrics.cc new file mode 100644 index 0000000000..045788bae3 --- /dev/null +++ b/lib/kernels/test/src/test_perf_metrics.cc @@ -0,0 +1,127 @@ +#include "doctest/doctest.h" +#include "kernels/perf_metrics.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test PerfMetrics Constructors and Metric Functions") { + SUBCASE("Test constructor with start_time only") { + double start = 100.0; + PerfMetrics pm(start); + + CHECK(pm.start_time == start); + CHECK(pm.current_time == start); + + CHECK(pm.train_all == 0); + if (pm.train_correct.has_value()) { + CHECK(pm.train_correct.value() == 0); + } + + CHECK(!pm.cce_loss.has_value()); + + if (pm.sparse_cce_loss.has_value()) { + CHECK(pm.sparse_cce_loss.value() == doctest::Approx(0.0f)); + } + if (pm.mse_loss.has_value()) { + CHECK(pm.mse_loss.value() == doctest::Approx(0.0f)); + } + if (pm.rmse_loss.has_value()) { + CHECK(pm.rmse_loss.value() == doctest::Approx(0.0f)); + } + if (pm.mae_loss.has_value()) { + CHECK(pm.mae_loss.value() == doctest::Approx(0.0f)); + } + } + + SUBCASE("Test full constructor and throughput/accuracy") { + int train_all = 200; + int train_correct = 150; + float cce = 1.2f; + float sparse_cce = 1.0f; + float mse = 0.5f; + float rmse = 0.7f; + float mae = 0.3f; + double start = 100.0; + double curr = 110.0; + PerfMetrics pm(train_all, + train_correct, + cce, + sparse_cce, + mse, + rmse, + mae, + start, + curr); + + CHECK(pm.train_all == train_all); + CHECK(pm.train_correct.has_value()); + CHECK(pm.train_correct.value() == train_correct); + CHECK(pm.cce_loss.has_value()); + CHECK(pm.cce_loss.value() == doctest::Approx(cce)); + CHECK(pm.sparse_cce_loss.has_value()); + CHECK(pm.sparse_cce_loss.value() == doctest::Approx(sparse_cce)); + CHECK(pm.mse_loss.has_value()); + CHECK(pm.mse_loss.value() == doctest::Approx(mse)); + CHECK(pm.rmse_loss.has_value()); + CHECK(pm.rmse_loss.value() == doctest::Approx(rmse)); + CHECK(pm.mae_loss.has_value()); + CHECK(pm.mae_loss.value() == doctest::Approx(mae)); + CHECK(pm.start_time == start); + CHECK(pm.current_time == curr); + + float expected_throughput = train_all / (curr - start); + CHECK(get_throughput(pm) == doctest::Approx(expected_throughput)); + + float expected_accuracy = static_cast(train_correct) / train_all; + CHECK(get_accuracy(pm) == doctest::Approx(expected_accuracy)); + } + + SUBCASE("Test update function") { + PerfMetrics pm1(100, 50, 1.0f, 0.5f, 0.3f, 0.2f, 0.1f, 0.0, 1.0); + PerfMetrics pm2(50, 30, 0.5f, 0.3f, 0.2f, 0.1f, 0.05f, 0.0, 1.5); + + PerfMetrics updated = update(pm1, pm2); + + CHECK(updated.train_all == (100 + 50)); + if (updated.train_correct.has_value()) { + CHECK(updated.train_correct.value() == (50 + 30)); + } + + CHECK(updated.cce_loss.has_value()); + CHECK(updated.cce_loss.value() == doctest::Approx(1.0f + 0.5f)); + CHECK(updated.sparse_cce_loss.has_value()); + CHECK(updated.sparse_cce_loss.value() == doctest::Approx(0.5f + 
0.3f)); + CHECK(updated.mse_loss.has_value()); + CHECK(updated.mse_loss.value() == doctest::Approx(0.3f + 0.2f)); + CHECK(updated.rmse_loss.has_value()); + CHECK(updated.rmse_loss.value() == doctest::Approx(0.2f + 0.1f)); + CHECK(updated.mae_loss.has_value()); + CHECK(updated.mae_loss.value() == doctest::Approx(0.1f + 0.05f)); + CHECK(updated.current_time == pm2.current_time); + } + + SUBCASE("Test apply_scale function") { + PerfMetrics pm(100, 50, 2.0f, 1.0f, 0.8f, 0.6f, 0.4f, 0.0, 2.0); + float scale = 0.5f; + PerfMetrics scaled = apply_scale(pm, scale); + + CHECK(scaled.cce_loss.has_value()); + CHECK(scaled.cce_loss.value() == doctest::Approx(2.0f * scale)); + CHECK(scaled.sparse_cce_loss.has_value()); + CHECK(scaled.sparse_cce_loss.value() == doctest::Approx(1.0f * scale)); + CHECK(scaled.mse_loss.has_value()); + CHECK(scaled.mse_loss.value() == doctest::Approx(0.8f * scale)); + CHECK(scaled.rmse_loss.has_value()); + CHECK(scaled.rmse_loss.value() == doctest::Approx(0.6f * scale)); + CHECK(scaled.mae_loss.has_value()); + CHECK(scaled.mae_loss.value() == doctest::Approx(0.4f * scale)); + + CHECK(scaled.train_all == pm.train_all); + if (scaled.train_correct.has_value()) { + CHECK(scaled.train_correct.value() == pm.train_correct.value()); + } + CHECK(scaled.start_time == pm.start_time); + CHECK(scaled.current_time == pm.current_time); + } + } +} diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..06db1989eb 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/pool_2d_kernels.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -46,10 +49,10 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = make_tensor_shape_from_ff_ordered( + {input_w, input_h, input_c, input_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_ff_ordered( + {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -62,28 +65,23 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - 
input_accessor.ptr, - input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..921a5ff08c 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/reduction_kernels.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,21 +8,22 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {10_n, 10_n, 10_n, 10_n, 10_n}); + TensorShape input_shape = make_tensor_shape_from_ff_ordered( + {10_n, 10_n, 10_n, 10_n, 10_n}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({10_n}); + make_tensor_shape_from_ff_ordered({10_n}, DataType::FLOAT); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -30,30 +32,22 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_grad_data = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..b2c8ea0c19 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,115 @@ #include "doctest/doctest.h" #include "kernels/replicate_kernels.h" +#include "kernels/replicate_kernels_cpu.h" #include "test_utils.h" using namespace 
::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + TEST_CASE("Call Replicate Forward and Backward Kernels") { nonnegative_int num_replicas = 10_n; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = input_shape; + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, output_grad_accessor, + input_grad_accessor, + num_replicas.unwrap_nonnegative()); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + nonnegative_int num_replicas = 2_n; + + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({5_n}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({num_replicas, 5_n}, DataType::FLOAT); + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessor_data_is_equal(output_accessor_gpu, 
output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, num_replicas.unwrap_nonnegative()); - std::vector check_aggregated_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_replicas.unwrap_nonnegative()); + + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..fa67953947 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,12 +5,15 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +21,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - 
input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..01eded4297 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,26 +1,31 @@ #include "doctest/doctest.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { + nonnegative_int num_out_blks = 1_n; nonnegative_int reverse_dim_size = 10_n; nonnegative_int in_blk_size = 10_n; - nonnegative_int num_out_blks = 1_n; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = make_tensor_shape_from_ff_ordered( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -33,17 +38,14 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size.unwrap_nonnegative(), input_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + allocator.allocate_tensor(input_shape); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), @@ -54,10 +56,91 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size.unwrap_nonnegative(), input_grad_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + nonnegative_int num_out_blks = 1_n; + nonnegative_int reverse_dim_size = 4_n; + nonnegative_int in_blk_size = 3_n; + + TensorShape input_shape = make_tensor_shape_from_ff_ordered( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // 
Run GPU Reverse Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Reverse::forward_kernel( + managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_accessor_gpu.shape.num_elements().unwrap_nonnegative()); + + // Run CPU Reverse Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel( + input_accessor_cpu, + output_accessor_cpu, + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative()); + + CHECK(accessor_data_is_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_grad_accessor_gpu.shape.num_elements().unwrap_nonnegative()); + + // Run CPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Reverse::cpu_backward_kernel( + output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative()); + + CHECK(accessor_data_is_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..ecb996227f 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -12,12 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = @@ -40,30 +43,22 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW 
output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector expected_input_grad_data = std::vector( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..20a6898896 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" +#include "op-attrs/datatype_value.h" #include "test_utils.h" #include "utils/containers/repeat.h" @@ -12,13 +13,17 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + TensorShape input_shape = + make_tensor_shape_from_ff_ordered({100_n}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_ff_ordered({50_n}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -47,8 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..ac8876ac98 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -12,47 +12,41 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + make_tensor_shape_from_ff_ordered({10_n, 10_n}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = 
allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), attrs, input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 903b666fa9..e335e5b449 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,106 +1,206 @@ #include "test_utils.h" +#include "kernels/datatype_dispatch.h" +#include "op-attrs/tensor_shape.h" +#include "utils/join_strings.h" +#include -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } +namespace FlexFlow { - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } +TensorShape make_tensor_shape_from_ff_ordered(FFOrdered dims, + DataType DT) { + return TensorShape{ + TensorDims{ + dims, + }, + DT, + }; +} - return accessor; +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; } -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_zero_filled_accessor_w(shape, allocator); + return read_only_accessor_from_write_accessor(accessor); +} + +template +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator 
= create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t
; + T *data_ptr = src_accessor.get
(); + + std::random_device rd; + std::mt19937 gen(rd()); + size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); + if constexpr (std::is_same::value) { + std::bernoulli_distribution dist(0.5); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_floating_point::value) { + std::uniform_real_distribution dist(-1.0, 1.0); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_integral::value) { + std::uniform_int_distribution dist(0, 99); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } + + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; } +}; + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + return DataTypeDispatch1{}( + shape.data_type, shape, allocator); +} + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); - return accessor; + return read_only_accessor_from_write_accessor(accessor); } -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); +template +struct FillWithZeros { + void operator()(GenericTensorAccessorW const &accessor) { + using T = real_type_t
; - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; + if (accessor.device_type == DeviceType::CPU) { + memset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); + } else { + checkCUDA(cudaMemset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * + sizeof(T))); + } } +}; + +void fill_with_zeros(GenericTensorAccessorW const &accessor) { + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
; + + T const *data_ptr = accessor.get
(); - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); + int volume = accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; } +}; - return accessor; +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); } -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + int const dims = accessor.shape.num_dims(); + int const cols = accessor.shape.at(legion_dim_t{0_n}); + int const rows = (dims == 2) ? accessor.shape.at(legion_dim_t{1_n}) : 1_n; + + auto get_element = [dims, &accessor](int j, int i) { + return (dims == 1) ? accessor.at
({j}) : accessor.at
({j, i}); + }; + + std::vector indices(cols); + std::iota(indices.begin(), indices.end(), 0); + for (int i = 0; i < rows; ++i) { + stream << join_strings(indices, " ", [=](int j) { + return get_element(j, i); + }) << std::endl; + } } +}; + +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + DataTypeDispatch1{}( + accessor.data_type, cpu_accessor, stream); } -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
<DT>; + if (!val.template has<T>()) { + throw mk_runtime_error("create_filled_accessor expected data type of " + "shape and passed-in value to match"); + + auto unwrapped_value = val.get<T>(); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + T *data_ptr = src_accessor.get<DT>
(); + + int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); } -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); } +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 08f0f382fb..2e7294ed1d 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -1,59 +1,63 @@ #ifndef _FLEXFLOW_KERNELS_TEST_UTILS #define _FLEXFLOW_KERNELS_TEST_UTILS +#include "kernels/copy_tensor_accessor.h" #include "kernels/device.h" +#include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" #include -#include #include #include #include -using namespace FlexFlow; +namespace FlexFlow { GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); + Allocator &allocator); -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims); +TensorShape make_tensor_shape_from_ff_ordered(FFOrdered dims, + DataType DT); -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims); +bool contains_non_zero(GenericTensorAccessorR const &accessor); -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); +void fill_with_zeros(GenericTensorAccessorW const &accessor); - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream); + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), 
data.end(), [](T const &val) { return val == 0; }); +template +std::vector repeat(std::size_t n, Func &&func) { + std::vector result; + // result.reserve(n); // Sometimes we don't have default constructor for T + for (std::size_t i = 0; i < n; ++i) { + result.push_back(func()); + } + return result; } +} // namespace FlexFlow + // Specialize doctest's StringMaker for std::vector template <> struct doctest::StringMaker> { diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/local-execution/include/local-execution/local_cpu_allocator.h index d1e81facf2..cf6cfe35d1 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/local-execution/include/local-execution/local_cpu_allocator.h @@ -12,6 +12,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/local-execution/src/local_cpu_allocator.cc index 4ca5f987a8..c4657e26b5 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/local-execution/src/local_cpu_allocator.cc @@ -17,6 +17,10 @@ void LocalCPUAllocator::deallocate(void *ptr) { } } +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; +} + Allocator create_local_cpu_memory_allocator() { return Allocator::create(); } diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- 
a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 1df6da8d8e..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -134,9 +134,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index bb1504a3f5..55ff354483 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -107,8 +107,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -120,10 +120,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), 
bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index c5ff9199f3..fe6201ec98 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -88,10 +88,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -106,10 +106,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 0f872b5d50..af6fc16272 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -40,15 +40,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 6f0901e66a..bde5579b56 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -125,17 +125,17 @@ static std::optional auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_ptr = NULL; if (attrs.use_bias) { bias_ptr = bias.get_float_ptr(); } @@ -148,13 +148,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_ptr, in_dim.unwrap_nonnegative(), out_dim.unwrap_nonnegative(), batch_size.unwrap_nonnegative()); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index fb0635efba..f85874dc0a 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -115,19 +115,19 @@ static std::optional 
Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index ee1a7c6c4e..b07d9fe965 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -63,13 +63,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 6c0c813c8d..7b6e9fe2f6 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -85,8 +85,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index d3ada35d93..99aeb913ba 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -66,8 +66,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree.unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index fc3a75607d..e382b2668e 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -86,8 +86,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 0e94422c5f..e008098e05 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -106,8 +106,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume().unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 4146836b9a..1859bb0ccc 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ 
b/lib/local-execution/src/ops/transpose.cc @@ -67,8 +67,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", attrs, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,68 +12,71 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// /*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape 
inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,6 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..4604ef0b4e --- /dev/null +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/datatype_value.h" + +namespace FlexFlow { 
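+// Each helper below wraps a concrete value in the DataTypeValue variant so +// that callers (e.g. create_filled_accessor_w in the kernel tests) can pass a +// typed fill value without writing out the variant construction inline.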
diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc
new file mode 100644
index 0000000000..9b0e90b601
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc
@@ -0,0 +1,68 @@
+#include "op-attrs/datatype_value.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("test make_data_type_value") {
+    SUBCASE("make_float_data_type_value") {
+      float value = 1.0f;
+      DataTypeValue data_type_value = make_float_data_type_value(value);
+
+      CHECK(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<float>() == value);
+    }
+
+    SUBCASE("make_double_data_type_value") {
+      double value = 2.71828;
+      DataTypeValue data_type_value = make_double_data_type_value(value);
+
+      CHECK(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<double>() == value);
+    }
+
+    SUBCASE("make_int32_data_type_value") {
+      int32_t value = -42;
+      DataTypeValue data_type_value = make_int32_data_type_value(value);
+
+      CHECK(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int32_t>() == value);
+    }
+
+    SUBCASE("make_int64_data_type_value") {
+      int64_t value = 1LL << 40;
+      DataTypeValue data_type_value = make_int64_data_type_value(value);
+
+      CHECK(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int64_t>() == value);
+    }
+
+    SUBCASE("make_bool_data_type_value") {
+      bool value = true;
+      DataTypeValue data_type_value = make_bool_data_type_value(value);
+
+      CHECK(data_type_value.has<bool>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK(data_type_value.get<bool>() == value);
+    }
+  }
+}
diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml
new file mode 100644
index 0000000000..ebb2323203
--- /dev/null
+++ b/lib/pcg/include/pcg/metric.enum.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "Metric"
+features = [
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+[[values]]
+name = "ACCURACY"
+
+[[values]]
+name = "CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "SPARSE_CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "ROOT_MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "MEAN_ABSOLUTE_ERROR"
diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h
new file mode 100644
index 0000000000..343c2154dd
--- /dev/null
+++ b/lib/pcg/include/pcg/metric_attrs.h
@@ -0,0 +1,28 @@
+#ifndef _FF_METRICS_H_
+#define _FF_METRICS_H_
+
+#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "pcg/metric.dtg.h"
+#include "utils/fmt.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+class MetricsAttrs {
+public:
+  MetricsAttrs() = delete;
+  MetricsAttrs(LossFunction, std::unordered_set<Metric> const &);
+
+public:
+  LossFunction loss_type;
+  bool measure_accuracy;
+  bool measure_categorical_crossentropy;
+  bool measure_sparse_categorical_crossentropy;
+  bool measure_mean_squared_error;
+  bool measure_root_mean_squared_error;
+  bool measure_mean_absolute_error;
+};
+
+} // namespace FlexFlow
+
+#endif
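
For context on how the relocated MetricsAttrs is meant to be used, here is a minimal construction sketch based only on the interface declared above (the LossFunction value is taken as a parameter because its enumerators are not part of this diff):

#include "pcg/metric_attrs.h"

#include <unordered_set>

using namespace FlexFlow;

// Sketch: the constructor folds the requested metric set into the
// individual measure_* flags (see metric_attrs.cc below).
MetricsAttrs make_example_metrics_attrs(LossFunction loss) {
  std::unordered_set<Metric> metrics = {
      Metric::ACCURACY,
      Metric::MEAN_SQUARED_ERROR,
  };
  MetricsAttrs attrs{loss, metrics};
  // attrs.measure_accuracy and attrs.measure_mean_squared_error are now true;
  // the remaining measure_* flags stay false.
  return attrs;
}
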
diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc
index 267f05499c..f0bdeb9cde 100644
--- a/lib/pcg/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/computation_graph_builder.cc
@@ -1,5 +1,6 @@
 #include "pcg/computation_graph_builder.h"
 #include "op-attrs/computation_graph_op_attrs.h"
+#include "op-attrs/datatype_value.h"
 #include "op-attrs/get_incoming_tensor_roles.h"
 #include "op-attrs/get_op_type.h"
 #include "op-attrs/ops/attention.h"
diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc
new file mode 100644
index 0000000000..9a93e75350
--- /dev/null
+++ b/lib/pcg/src/pcg/metric_attrs.cc
@@ -0,0 +1,38 @@
+#include "pcg/metric_attrs.h"
+
+namespace FlexFlow {
+MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
+                           std::unordered_set<Metric> const &metrics)
+    : loss_type(_loss_type), measure_accuracy(false),
+      measure_categorical_crossentropy(false),
+      measure_sparse_categorical_crossentropy(false),
+      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
+      measure_mean_absolute_error(false) {
+  for (Metric const &m : metrics) {
+    switch (m) {
+      case Metric::ACCURACY:
+        measure_accuracy = true;
+        continue;
+      case Metric::CATEGORICAL_CROSSENTROPY:
+        measure_categorical_crossentropy = true;
+        continue;
+      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+        measure_sparse_categorical_crossentropy = true;
+        continue;
+      case Metric::MEAN_SQUARED_ERROR:
+        measure_mean_squared_error = true;
+        continue;
+      case Metric::ROOT_MEAN_SQUARED_ERROR:
+        measure_root_mean_squared_error = true;
+        continue;
+      case Metric::MEAN_ABSOLUTE_ERROR:
+        measure_mean_absolute_error = true;
+        continue;
+      default:
+        throw mk_runtime_error(fmt::format(
+            "Initializing MetricsAttrs with unrecognized metrics type {}", m));
+    }
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc
index 4e72b2fe0f..900f3cc5ef 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc
@@ -1,4 +1,5 @@
 #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
+#include "op-attrs/datatype_value.h"
 #include "op-attrs/get_incoming_tensor_roles.h"
 #include "op-attrs/ops/attention.h"
 #include "op-attrs/ops/attention_attrs.dtg.h"
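
Note that the MetricsAttrs constructor added above formats a Metric directly with {} in its error message. That relies on the "fmt" feature requested in metric.enum.toml; assuming dtgen generates the corresponding formatting support (which is what allows the hand-written fmt::formatter to be deleted from metrics_functions.h below), usage looks like this sketch:

#include "pcg/metric.dtg.h"
#include "utils/fmt.h"

#include <string>

using namespace FlexFlow;

// Sketch: assumes the dtgen-generated Metric is directly formattable via fmt.
// The exact rendered spelling of each enumerator is up to the generated code.
std::string describe_metric(Metric m) {
  return fmt::format("tracking metric {}", m);
}
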
diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc
index feb6e704b2..33e15baed2 100644
--- a/lib/runtime/src/metrics_functions.cc
+++ b/lib/runtime/src/metrics_functions.cc
@@ -25,39 +25,6 @@ namespace FlexFlow {
 
 LegionRuntime::Logger::Category log_metrics("metrics");
 
-MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
-    : loss_type(_loss_type), measure_accuracy(false),
-      measure_categorical_crossentropy(false),
-      measure_sparse_categorical_crossentropy(false),
-      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
-      measure_mean_absolute_error(false) {
-  for (Metric const &m : metrics) {
-    switch (m) {
-      case Metric::ACCURACY:
-        measure_accuracy = true;
-        continue;
-      case Metric::CATEGORICAL_CROSSENTROPY:
-        measure_categorical_crossentropy = true;
-        continue;
-      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-        measure_sparse_categorical_crossentropy = true;
-        continue;
-      case Metric::MEAN_SQUARED_ERROR:
-        measure_mean_squared_error = true;
-        continue;
-      case Metric::ROOT_MEAN_SQUARED_ERROR:
-        measure_root_mean_squared_error = true;
-        continue;
-      case Metric::MEAN_ABSOLUTE_ERROR:
-        measure_mean_absolute_error = true;
-        continue;
-      default:
-        throw mk_runtime_error("Unrecogonized metrics type {}", m);
-    }
-  }
-}
-
 enum Slots {
   LOGIT,
   LABEL,
diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h
index fbb0b633bf..73dc3bbc51 100644
--- a/lib/runtime/src/metrics_functions.h
+++ b/lib/runtime/src/metrics_functions.h
@@ -16,38 +16,13 @@
 #ifndef _FF_METRICS_FUNCTIONS_H_
 #define _FF_METRICS_FUNCTIONS_H_
 
+#include "kernels/metric.h"
 #include "kernels/perf_metrics.h"
 #include "legion.h"
-#include "op-attrs/ops/loss_functions.h"
 #include "task_spec/task_invocation.h"
-#include "utils/fmt.h"
 
 namespace FlexFlow {
 
-enum class Metric {
-  ACCURACY,
-  CATEGORICAL_CROSSENTROPY,
-  SPARSE_CATEGORICAL_CROSSENTROPY,
-  MEAN_SQUARED_ERROR,
-  ROOT_MEAN_SQUARED_ERROR,
-  MEAN_ABSOLUTE_ERROR,
-};
-
-class MetricsAttrs {
-public:
-  MetricsAttrs() = delete;
-  MetricsAttrs(LossFunction, std::vector<Metric> const &);
-
-public:
-  LossFunction loss_type;
-  bool measure_accuracy;
-  bool measure_categorical_crossentropy;
-  bool measure_sparse_categorical_crossentropy;
-  bool measure_mean_squared_error;
-  bool measure_root_mean_squared_error;
-  bool measure_mean_absolute_error;
-};
-
 TypedIndexTaskInvocation
     compute_metrics(MetricsAttrs const &,
                     parallel_tensor_guid_t const &logit,
@@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs,
                  measure_root_mean_squared_error,
                  measure_mean_absolute_error);
 
-namespace fmt {
-
-template <>
-struct formatter<::FlexFlow::Metric> : formatter<string_view> {
-  template <typename FormatContext>
-  auto format(::FlexFlow::Metric m, FormatContext &ctx) const
-      -> decltype(ctx.out()) {
-    using namespace FlexFlow;
-
-    string_view name = "unknown";
-    switch (m) {
-      case Metric::ACCURACY:
-        name = "Accuracy";
-        break;
-      case Metric::CATEGORICAL_CROSSENTROPY:
-        name = "CategoricalCrossEntropy";
-        break;
-      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-        name = "SparseCategoricalCrossEntropy";
-        break;
-      case Metric::MEAN_SQUARED_ERROR:
-        name = "MeanSquaredError";
-        break;
-      case Metric::ROOT_MEAN_SQUARED_ERROR:
-        name = "RootMeanSquaredError";
-        break;
-      case Metric::MEAN_ABSOLUTE_ERROR:
-        name = "MeanAbsoluteError";
-        break;
-    }
-    return formatter<string_view>::format(name, ctx);
-  }
-};
-
-} // namespace fmt
-
 #endif
diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc
index 253fd3cb4f..83e7c15460 100644
--- a/lib/runtime/src/ops/embedding.cc
+++ b/lib/runtime/src/ops/embedding.cc
@@ -77,11 +77,11 @@ static std::optional
   return profile(backward_kernel,
                  profiling,
                  "[Embedding] backward_time = {:.2lf}ms\n",
-                 input,
                  output,
+                 input,
                  weight_grad,
-                 input.data_type,
                  output.data_type,
+                 input.data_type,
                  attrs.aggr,
                  input.shape.get_dim(),
                  output.shape.get_dim(),
diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h
index a266ddea77..b7fb27ffd9 100644
--- a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h
+++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h
@@ -43,6 +43,9 @@ class nonnegative_int {
   nonnegative_int operator++(int);
   nonnegative_int &operator+=(nonnegative_int const &other);
 
+  nonnegative_int operator-(nonnegative_int const &other) const;
+  nonnegative_int &operator-=(nonnegative_int const &other);
+
   nonnegative_int operator*(nonnegative_int const &other) const;
   nonnegative_int &operator*=(nonnegative_int const &other);
 
diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
index 3472a7eee2..37a5934c52 100644
--- a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
+++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
@@ -101,6 +101,15 @@ nonnegative_int &nonnegative_int::operator+=(nonnegative_int const &other) {
   return *this;
 }
 
+nonnegative_int nonnegative_int::operator-(nonnegative_int const &other) const {
+  return nonnegative_int{this->value_ - other.value_};
+}
+
+nonnegative_int &nonnegative_int::operator-=(nonnegative_int const &other) {
+  *this = nonnegative_int{this->value_ - other.value_};
+  return *this;
+}
+
 nonnegative_int nonnegative_int::operator*(nonnegative_int const &other) const {
   return nonnegative_int{this->value_ * other.value_};
 }
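
Finally, a small usage sketch for the new nonnegative_int subtraction operators. The difference is re-wrapped in nonnegative_int, so (assuming the constructor keeps enforcing the non-negativity invariant) subtracting a larger value from a smaller one is rejected at construction time rather than silently wrapping:

#include "utils/nonnegative_int/nonnegative_int.h"

using namespace FlexFlow;

// Sketch: exercises operator- and operator-= as declared above.
void nonnegative_int_subtraction_sketch() {
  nonnegative_int a{7};
  nonnegative_int b{3};

  nonnegative_int diff = a - b; // holds 4
  a -= b;                       // a now also holds 4

  // nonnegative_int bad = b - a; // 3 - 4 would violate the invariant and is
  //                              // expected to be rejected by the constructor.
  (void)diff;
}
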