Changes from all commits
46 commits
c0bf108
test_utils refactor, local_cpu_allocator
dylanllim Jul 11, 2024
47ad0d8
test utils modification, cast, reverse, and replicate cpu kernels
dylanllim Jul 12, 2024
921fe65
combine kernel
dylanllim Jul 14, 2024
4ca67aa
combine kernels .h file
dylanllim Jul 14, 2024
86edf2e
Implementations for methods for machine_views and associated modules …
Marsella8 Jul 19, 2024
d9af610
test utils logic cleanup, reverse cpu_kernel pedagogical implmentatio…
dylanllim Jul 31, 2024
64034a5
cpu_kernel's refactor, generic tensor accessor indexing
dylanllim Oct 8, 2024
0304f17
accessor.h formatting
dylanllim Oct 8, 2024
7c3ff87
mk_runtime_error formatting
dylanllim Oct 8, 2024
65d7804
reverse_kernels include
dylanllim Oct 8, 2024
7c5fb1f
test_utils refactor and clarity
dylanllim Oct 15, 2024
8188afe
formatting
dylanllim Oct 15, 2024
a13255b
comment removal reverse_kernels
dylanllim Oct 15, 2024
7ed5624
Issue #1435, tests for managed stream and handle
dylanllim Oct 16, 2024
c1758c0
#1435 formatting
dylanllim Oct 16, 2024
54b3888
#1409 issue, change datatype for linear kernels away from void *
dylanllim Oct 16, 2024
5b5c2f6
R & W accessor changes, minimize code bloat
dylanllim Nov 5, 2024
ddae367
code formatting and refactor
dylanllim Nov 16, 2024
507df4a
issue #1502 & issue #1540
dylanllim Nov 22, 2024
c64a55c
format check
dylanllim Nov 22, 2024
a091652
branch merge and test fixes
dylanllim Jan 28, 2025
f19df3a
Merge remote-tracking branch 'origin/master' into cpu-kernels-tests-v2
dylanllim Jan 29, 2025
8860adf
build issues
dylanllim Jan 29, 2025
7b74acc
Add AWS linux AMI to runs-on for testing (#1589)
lockshaw Jan 30, 2025
8cdc677
Pin runs-on images (#1590)
lockshaw Jan 30, 2025
209db7e
GPU CI Fix (Pin runs-on GPU image) (#1588)
lockshaw Jan 31, 2025
0d2ffdb
Merge substitution-builder (#1575)
victorli2002 Feb 1, 2025
fe339eb
test_utils refactor, local_cpu_allocator
dylanllim Jul 11, 2024
2e2ae13
test utils modification, cast, reverse, and replicate cpu kernels
dylanllim Jul 12, 2024
6c30466
combine kernel
dylanllim Jul 14, 2024
5b5c591
test utils logic cleanup, reverse cpu_kernel pedagogical implmentatio…
dylanllim Jul 31, 2024
f0432c3
cpu_kernel's refactor, generic tensor accessor indexing
dylanllim Oct 8, 2024
74d186d
test_utils refactor and clarity
dylanllim Oct 15, 2024
f95d9da
R & W accessor changes, minimize code bloat
dylanllim Nov 5, 2024
8c8bc75
issue #1502 & issue #1540
dylanllim Nov 22, 2024
c00ab84
branch merge and test fixes
dylanllim Jan 28, 2025
bc4b659
merge
dylanllim Feb 5, 2025
3146712
Merge remote-tracking branch 'origin/master' into cpu-kernels-tests
dylanllim Feb 5, 2025
e71b6d7
build after merge
dylanllim Feb 5, 2025
311caf8
kernel issues
dylanllim Feb 8, 2025
157407d
managed stream / handle test case fix
dylanllim Feb 10, 2025
338fc8d
Merge remote-tracking branch 'origin/master' into cpu-kernels-tests
dylanllim Feb 10, 2025
f73e7a1
accessor, array_shape, copy_tensor_accessor, datatype_dispatch, alloc…
dylanllim Feb 25, 2025
4fc0475
remove . files
dylanllim Feb 25, 2025
8b72dcd
format issues
dylanllim Feb 25, 2025
2914494
merge w/ master
dylanllim Feb 25, 2025
4 changes: 2 additions & 2 deletions lib/kernels/CMakeLists.txt
@@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC
CONFIGURE_DEPENDS
LIST_DIRECTORIES False
src/*.cc
src/cuda/cuda_helper.cu
src/cuda/ops/*.cu
src/cuda/*.cu
)

add_library(
@@ -30,6 +29,7 @@ target_link_libraries(
cudnn
nccl
utils
pcg
)

define_ff_vars(${project_target})
224 changes: 172 additions & 52 deletions lib/kernels/include/kernels/accessor.h
@@ -5,11 +5,102 @@
#include "device.h"
#include "kernels/ff_handle.h"
#include "op-attrs/datatype.h"
#include "pcg/device_type.dtg.h"
#include "utils/exception.h"
#include "utils/required.h"

namespace FlexFlow {

inline int calculate_accessor_offset(std::vector<int> const &indices,
ArrayShape const &shape) {
int offset = 0;
int multiplier = 1;

for (int i = 0; i < shape.num_dims(); i++) {
if (indices.at(i) >= shape.at(legion_dim_t{nonnegative_int{i}})) {
throw mk_runtime_error(
fmt::format("In {} dimension, attempting to access index {} "

Check warning on line 22 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L19-L22

Added lines #L19 - L22 were not covered by tests
"when only {} indexes exist",
i,
indices.at(i),
shape.at(legion_dim_t{nonnegative_int{i}})));
}

offset += indices.at(i) * multiplier;
multiplier *=
shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative();
}

return offset;
}

class GenericTensorAccessorR {
public:
template <DataType DT>
typename data_type_enum_to_class<DT>::type const *get() const {
if (this->data_type == DT) {
return static_cast<real_type_t<DT> const *>(this->ptr);
} else {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));

Check warning on line 45 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L44-L45

Added lines #L44 - L45 were not covered by tests
}
}

int32_t const *get_int32_ptr() const;
int64_t const *get_int64_ptr() const;
float const *get_float_ptr() const;
double const *get_double_ptr() const;
half const *get_half_ptr() const;

GenericTensorAccessorR() = delete;

GenericTensorAccessorR(DataType data_type,
ArrayShape const &shape,
void const *ptr,
DeviceType device_type);

bool operator==(GenericTensorAccessorR const &) const;
bool operator!=(GenericTensorAccessorR const &) const;

template <DataType DT>
real_type_t<DT> const &at(std::vector<int> const &indices) const {
if (this->device_type != DeviceType::CPU) {
throw mk_runtime_error("Calling at() on non-CPU allocated tensor");

Check warning on line 68 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L66-L68

Added lines #L66 - L68 were not covered by tests
}
if (this->data_type != DT) {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));

Check warning on line 72 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L70-L72

Added lines #L70 - L72 were not covered by tests
}
if (indices.size() != this->shape.num_dims()) {
throw mk_runtime_error(fmt::format("Number of indices ({}) does not "

Check warning on line 75 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L74-L75

Added lines #L74 - L75 were not covered by tests
"match the number of dimensions ({}).",
indices.size(),
this->shape.num_dims()));
}

using T = real_type_t<DT>;
T const *data_ptr = static_cast<T const *>(this->ptr);
int offset = calculate_accessor_offset(indices, this->shape);
return data_ptr[offset];
}

public:
DataType data_type;
ArrayShape shape;
void const *ptr;
DeviceType device_type;

private:
std::tuple<decltype(data_type) const &,
decltype(shape) const &,
decltype(ptr) const &,
decltype(device_type) const &>
tie() const;
};

std::string format_as(GenericTensorAccessorR const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);

class GenericTensorAccessorW {
public:
template <DataType DT>
@@ -28,64 +119,78 @@
double *get_double_ptr() const;
half *get_half_ptr() const;

public:
DataType data_type;
ArrayShape shape;
req<void *> ptr;
};
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
data_type,
shape,
ptr);
GenericTensorAccessorW() = delete;

std::string format_as(GenericTensorAccessorW const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
GenericTensorAccessorW(DataType data_type,
ArrayShape const &shape,
void *ptr,
DeviceType device_type);

bool operator==(GenericTensorAccessorW const &) const;
bool operator!=(GenericTensorAccessorW const &) const;

operator GenericTensorAccessorR() const;

class GenericTensorAccessorR {
public:
template <DataType DT>
typename data_type_enum_to_class<DT>::type const *get() const {
if (this->data_type == DT) {
return static_cast<real_type_t<DT> const *>(this->ptr);
} else {
real_type_t<DT> &at(std::vector<int> const &indices) {
if (this->device_type != DeviceType::CPU) {
throw mk_runtime_error("Calling at() on non-CPU allocated tensor");

Check warning on line 137 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L135-L137

Added lines #L135 - L137 were not covered by tests
}
if (this->data_type != DT) {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));
}
if (indices.size() != this->shape.num_dims()) {
throw mk_runtime_error(fmt::format("Number of indices ({}) does not "

Check warning on line 144 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L143-L144

Added lines #L143 - L144 were not covered by tests
"match the number of dimensions ({}).",
indices.size(),
this->shape.num_dims()));

Check warning on line 147 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L146-L147

Added lines #L146 - L147 were not covered by tests
}

using T = real_type_t<DT>;
T *data_ptr = static_cast<T *>(this->ptr);
int offset = calculate_accessor_offset(indices, this->shape);
return data_ptr[offset];
}

int32_t const *get_int32_ptr() const;
int64_t const *get_int64_ptr() const;
float const *get_float_ptr() const;
double const *get_double_ptr() const;
half const *get_half_ptr() const;
template <DataType DT>
real_type_t<DT> const &at(std::vector<int> const &indices) const {
if (this->device_type != DeviceType::CPU) {
throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
}
if (this->data_type != DT) {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));
}
if (indices.size() != this->shape.num_dims()) {
throw mk_runtime_error(fmt::format("Number of indices ({}) does not "
"match the number of dimensions ({}).",
indices.size(),
this->shape.num_dims()));
}

using T = real_type_t<DT>;
T const *data_ptr = static_cast<T const *>(this->ptr);
int offset = calculate_accessor_offset(indices, this->shape);
return data_ptr[offset];
}

public:
DataType data_type;
ArrayShape shape;
req<void const *> ptr;
};
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR,
data_type,
shape,
ptr);
void *ptr;
DeviceType device_type;

std::string format_as(GenericTensorAccessorR const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
private:
std::tuple<decltype(data_type) const &,
decltype(shape) const &,
decltype(ptr) const &,
decltype(device_type) const &>
tie() const;
};

int32_t *get_int32_ptr(GenericTensorAccessorW const &);
int64_t *get_int64_ptr(GenericTensorAccessorW const &);
float *get_float_ptr(GenericTensorAccessorW const &);
double *get_double_ptr(GenericTensorAccessorW const &);
half *get_half_ptr(GenericTensorAccessorW const &);
std::vector<int32_t *>
get_int32_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<int64_t *>
get_int64_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<float *>
get_float_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<double *>
get_double_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<half *> get_half_ptrs(std::vector<GenericTensorAccessorW> const &);
std::string format_as(GenericTensorAccessorW const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);

static_assert(is_fmtable<req<DataType> const &>::value, "");

@@ -137,6 +242,21 @@
std::vector<half const *>
get_half_ptrs(std::vector<GenericTensorAccessorR> const &);

int32_t *get_int32_ptr(GenericTensorAccessorW const &);
int64_t *get_int64_ptr(GenericTensorAccessorW const &);
float *get_float_ptr(GenericTensorAccessorW const &);
double *get_double_ptr(GenericTensorAccessorW const &);
half *get_half_ptr(GenericTensorAccessorW const &);
std::vector<int32_t *>
get_int32_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<int64_t *>
get_int64_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<float *>
get_float_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<double *>
get_double_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<half *> get_half_ptrs(std::vector<GenericTensorAccessorW> const &);

template <DataType DT>
std::vector<real_type_t<DT> const *>
get(std::vector<GenericTensorAccessorR> const &accs) {
@@ -147,24 +267,24 @@
return out;
}

bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a,
GenericTensorAccessorR const &accessor_b);

bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
GenericTensorAccessorR const &accessor_b);

GenericTensorAccessorR read_only_accessor_from_write_accessor(
GenericTensorAccessorW const &write_accessor);

bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1,
GenericTensorAccessorW const &acc2);

bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor,
ArrayShape const &expected_shape,
DataType const &expected_dtype);
bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1,
GenericTensorAccessorR const &acc2);

bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor,
ArrayShape const &expected_shape,
DataType const &expected_dtype);

std::pair<ArrayShape, DataType>
get_shape_and_datatype(GenericTensorAccessorR const &accessor);
std::pair<ArrayShape, DataType>
get_shape_and_datatype(GenericTensorAccessorW const &accessor);

} // namespace FlexFlow

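For orientation, the sketch below (not part of the PR) exercises the indexed-access API added above. calculate_accessor_offset treats dimension 0 as the fastest-varying index, so for a shape of {2, 3} the element at indices {1, 2} lands at flat offset 1 + 2 * 2 = 5. The DataType::FLOAT enumerator and the ArrayShape construction from a std::vector<nonnegative_int> are assumptions taken from the declarations shown in this diff; the shape and values are made up.

#include "kernels/accessor.h"
#include <vector>

using namespace FlexFlow;

// Sketch only: CPU-backed accessors over a plain host buffer.
void accessor_at_sketch() {
  std::vector<float> buffer = {0, 1, 2, 3, 4, 5};
  ArrayShape shape{std::vector<nonnegative_int>{nonnegative_int{2}, nonnegative_int{3}}};

  GenericTensorAccessorW writable{DataType::FLOAT, shape, buffer.data(), DeviceType::CPU};
  writable.at<DataType::FLOAT>({1, 2}) = 42.0f; // flat offset 5, i.e. buffer[5]

  GenericTensorAccessorR readable = writable; // via operator GenericTensorAccessorR()
  float value = readable.at<DataType::FLOAT>({1, 2}); // reads 42.0f
  (void)value;
  // at() throws via mk_runtime_error for non-CPU tensors, a datatype mismatch,
  // or a wrong number of indices, per the checks above.
}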
7 changes: 6 additions & 1 deletion lib/kernels/include/kernels/allocation.h
@@ -1,7 +1,7 @@
#ifndef _FLEXFLOW_KERNELS_ALLOCATION_H
#define _FLEXFLOW_KERNELS_ALLOCATION_H

#include "accessor.h"
#include "kernels/accessor.h"
#include <cstddef>
#include <memory>

@@ -11,16 +11,21 @@ struct IAllocator {
virtual void *allocate(size_t) = 0;
virtual void deallocate(void *) = 0;

virtual DeviceType get_allocation_device_type() const = 0;

virtual ~IAllocator() = default;
};

struct Allocator {
Allocator() = delete;

GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape);

void *allocate(size_t mem_size);
void deallocate(void *ptr);

DeviceType get_allocation_device_type() const;

template <typename T, typename... Args>
static typename std::enable_if<std::is_base_of<IAllocator, T>::value,
Allocator>::type
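The headline change here is the new pure-virtual get_allocation_device_type(), which is how callers (for example the CPU-only checks in accessor.h's at()) can learn where an allocation lives. A minimal, hypothetical host-memory allocator implementing the extended interface is sketched below; it is written against the declarations above and is not code from the PR.

#include "kernels/allocation.h"
#include <cstdlib>

namespace FlexFlow {

// Hypothetical example: a trivial malloc/free-backed IAllocator that reports
// its allocations as CPU-resident via the new get_allocation_device_type().
struct ExampleHostAllocator : public IAllocator {
  void *allocate(size_t requested_size) override {
    return std::malloc(requested_size);
  }

  void deallocate(void *ptr) override {
    std::free(ptr);
  }

  DeviceType get_allocation_device_type() const override {
    return DeviceType::CPU;
  }
};

} // namespace FlexFlow

An Allocator wrapping such an implementation can report DeviceType::CPU for everything it hands out, which is presumably how allocate_tensor fills in the device_type field added to the accessor classes.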
2 changes: 1 addition & 1 deletion lib/kernels/include/kernels/array_shape.h
@@ -15,7 +15,7 @@ namespace FlexFlow {
struct ArrayShape {
public:
ArrayShape() = delete;
ArrayShape(nonnegative_int *dims, nonnegative_int num_dims);
ArrayShape(nonnegative_int const *dims, nonnegative_int num_dims);
ArrayShape(TensorShape const &shape);
ArrayShape(std::vector<nonnegative_int> const &);

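The only change to ArrayShape is const-qualifying the dims pointer, so a shape can now be built from read-only dimension data without a cast. A hypothetical call site (not from the PR):

#include "kernels/array_shape.h"
#include <array>

using namespace FlexFlow;

// With the const-qualified parameter, dimension data that is itself const can
// be passed directly; previously this required non-const storage or a cast.
std::array<nonnegative_int, 3> const dims = {
    nonnegative_int{4}, nonnegative_int{8}, nonnegative_int{2}};
ArrayShape const shape{dims.data(), nonnegative_int{3}};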
6 changes: 2 additions & 4 deletions lib/kernels/include/kernels/attention_kernels.h
@@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState,
std::string format_as(MHAPerDeviceState const &x);
std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x);

namespace Kernels {
namespace MultiHeadAttention {
namespace Kernels::MultiHeadAttention {

MHAPerDeviceState init_kernel(PerDeviceFFHandle const &,
Allocator &,
@@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream,
void cleanup_kernel(Allocator &allocator,
MHAPerDeviceState const &device_state);

} // namespace MultiHeadAttention
} // namespace Kernels
} // namespace Kernels::MultiHeadAttention
} // namespace FlexFlow

#endif
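This hunk, like the batch_matmul_kernels.h hunk below, is purely syntactic: the separately opened namespace blocks are collapsed into C++17 nested namespace definitions, with no change to the declared names. For illustration, the two forms below are equivalent (sketch with a hypothetical declaration, not PR code):

// Pre-change style: one block per namespace level.
namespace FlexFlow {
namespace Kernels {
namespace BatchMatmul {
void example_kernel(); // hypothetical declaration
} // namespace BatchMatmul
} // namespace Kernels
} // namespace FlexFlow

// Post-change style (C++17 nested namespace definition): declares the exact
// same FlexFlow::Kernels::BatchMatmul::example_kernel.
namespace FlexFlow::Kernels::BatchMatmul {
void example_kernel();
} // namespace FlexFlow::Kernels::BatchMatmul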
8 changes: 2 additions & 6 deletions lib/kernels/include/kernels/batch_matmul_kernels.h
@@ -5,9 +5,7 @@
#include "kernels/allocation.h"
#include "kernels/ff_handle.h"

namespace FlexFlow {
namespace Kernels {
namespace BatchMatmul {
namespace FlexFlow::Kernels::BatchMatmul {

void forward_kernel(ffStream_t stream,
PerDeviceFFHandle const &handle,
@@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream,
int k,
int batch);

} // namespace BatchMatmul
} // namespace Kernels
} // namespace FlexFlow
} // namespace FlexFlow::Kernels::BatchMatmul

#endif