Changes from all commits
Commits (39)
2dc4c60  Add allocators (reyna-abhyankar, May 10, 2024)
2488514  Computation Graph and Builder (reyna-abhyankar, May 14, 2024)
9a59f34  Shift ops and remove legion names (reyna-abhyankar, May 14, 2024)
931b47c  Format (reyna-abhyankar, May 14, 2024)
8a66ed9  Format (reyna-abhyankar, May 14, 2024)
3ffe239  Fix tracked allocator (reyna-abhyankar, May 21, 2024)
da10906  Fix comp graph (reyna-abhyankar, May 22, 2024)
ae864ae  Merge branch 'repo-refactor' into comp-graph (reyna-abhyankar, May 22, 2024)
30330b7  Merge branch 'repo-refactor' into local-allocator (reyna-abhyankar, May 22, 2024)
da701bf  Merge branch 'repo-refactor' into op-refactor (reyna-abhyankar, May 22, 2024)
15fbcc8  Merge branch 'local-allocator' into op-refactor (reyna-abhyankar, May 22, 2024)
db6e3ec  Merge branch 'comp-graph' into op-refactor (reyna-abhyankar, May 22, 2024)
784742c  Add task spec (reyna-abhyankar, May 22, 2024)
036dbf6  Merge branch 'comp-graph' into op-refactor (reyna-abhyankar, May 22, 2024)
5fbb6a3  Merge branch 'local-allocator' into op-refactor (reyna-abhyankar, May 22, 2024)
905bdd1  Minor build issues (reyna-abhyankar, May 22, 2024)
13e6ce2  Merge branch 'op-refactor' of github.com:reyna-abhyankar/FlexFlow int… (reyna-abhyankar, May 22, 2024)
3a3684e  Build op task spec (reyna-abhyankar, May 22, 2024)
a4dd9d4  Build ops and op task spec (reyna-abhyankar, May 22, 2024)
5bc719f  Simplify edge set obtain (reyna-abhyankar, May 22, 2024)
c8bb9ad  Merge branch 'comp-graph' into op-refactor (reyna-abhyankar, May 22, 2024)
583b2d3  Format (reyna-abhyankar, May 22, 2024)
e0e5fe2  Merge branch 'repo-refactor' into op-refactor (reyna-abhyankar, May 28, 2024)
269557e  Fixes (reyna-abhyankar, May 28, 2024)
be791ad  Merge branch 'repo-refactor' into op-refactor (reyna-abhyankar, May 28, 2024)
269770a  Fix conflicts, some renaming (reyna-abhyankar, May 30, 2024)
5093acb  Merge branch 'repo-refactor' into op-refactor (lockshaw, May 30, 2024)
2fbf291  Fix gather kernels (reyna-abhyankar, May 30, 2024)
a2a7e0a  Finish gather operator (reyna-abhyankar, May 30, 2024)
e0b259c  Format (reyna-abhyankar, May 30, 2024)
55971f2  Fix substitutions (reyna-abhyankar, May 31, 2024)
89afe2c  Merge branch 'repo-refactor' into op-refactor (reyna-abhyankar, May 31, 2024)
da38f0a  Fix legion dim in gather (reyna-abhyankar, Jun 1, 2024)
286c8ae  Merge branch 'repo-refactor' into op-refactor (lockshaw, Jun 1, 2024)
5f539a3  Format string fixes (reyna-abhyankar, Jun 1, 2024)
26ddf7f  Fix include (reyna-abhyankar, Jun 1, 2024)
1dfc24e  Gather backward time (reyna-abhyankar, Jun 1, 2024)
c60efd9  Format (reyna-abhyankar, Jun 1, 2024)
c7b48dd  Change datatype for linear kernels away from void * (dylanllim, Jun 4, 2024)
2 changes: 1 addition & 1 deletion lib/kernels/include/kernels/array_shape.h
@@ -42,7 +42,7 @@ struct ArrayShape {

ArrayShape reversed_dim_order() const;
ArrayShape sub_shape(std::optional<legion_dim_t> start,
std::optional<legion_dim_t> end);
std::optional<legion_dim_t> end) const;

public:
LegionTensorDims dims;
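The added const qualifier is what lets the reworked gather kernels further down compute the gather stride directly from a const accessor's shape. A minimal sketch of that pattern, assuming the coord_t alias and the sub_shape/get_volume behavior used in gather_kernels.cu below (the helper name itself is hypothetical):

// Hypothetical helper mirroring the stride computation in gather_kernels.cu.
// Without `const` on ArrayShape::sub_shape, this would not compile, because
// `shape` is only reachable here through a const reference.
coord_t gather_stride(ArrayShape const &shape, legion_dim_t dim) {
  return shape.sub_shape(std::nullopt, legion_dim_t{dim.value() + 1}).get_volume();
}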
3 changes: 0 additions & 3 deletions lib/kernels/include/kernels/element_unary_kernels.h
@@ -9,9 +9,6 @@

namespace FlexFlow {

using ElementUnaryUnifiedAttrs =
std::variant<ElementUnaryAttrs, ElementScalarUnaryAttrs>;

struct ElementUnaryPerDeviceState {
ffTensorDescriptor_t inputTensor, outputTensor;
req<ffActivationDescriptor_t> actiDesc;
24 changes: 11 additions & 13 deletions lib/kernels/include/kernels/gather_kernels.h
@@ -2,36 +2,34 @@
#define _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H

#include "accessor.h"
#include "device.h"
#include "kernels/device.h"

namespace FlexFlow {

struct GatherPerDeviceState {
int legion_dim;
req<DataType> index_data_type;
PerDeviceFFHandle handle;
legion_dim_t legion_dim;
};

FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState,
legion_dim,
index_data_type);
handle,
legion_dim);

namespace Kernels {
namespace Gather {

void forward_kernel(ffStream_t stream,
GatherPerDeviceState const &m,
GenericTensorAccessorR const &input,
GenericTensorAccessorR const &index,
GenericTensorAccessorW const &output,
size_t stride,
size_t input_dim_size,
size_t output_dim_size);
GenericTensorAccessorW const &output);

void backward_kernel(ffStream_t stream,
GatherPerDeviceState const &m,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorR const &index,
GenericTensorAccessorW const &input_grad,
size_t stride,
size_t input_dim_size,
size_t output_dim_size);
GenericTensorAccessorW const &input_grad);

} // namespace Gather
} // namespace Kernels
} // namespace FlexFlow
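With the per-device state reduced to the device handle and the gather dimension, the stride and dimension sizes disappear from the public entry points; the launcher recovers them from the accessor shapes (see the gather_kernels.cu changes below). A hypothetical call-site sketch, where the stream, handle, and accessors are assumed to come from the surrounding task and only the names declared in this header are from the PR:

// Hypothetical call site; make_gather_state is a hypothetical setup helper.
GatherPerDeviceState m = make_gather_state(handle, legion_dim_t{1});

// Note the trimmed parameter lists: no stride or dim sizes are passed anymore.
Kernels::Gather::forward_kernel(stream, m, input, index, output);
Kernels::Gather::backward_kernel(stream, m, output_grad, index, input_grad);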
22 changes: 11 additions & 11 deletions lib/kernels/include/kernels/linear_kernels.h
@@ -50,22 +50,22 @@ bool use_activation(Activation activation);

void forward_kernel(ffStream_t stream,
LinearPerDeviceState const &m,
void const *input_ptr,
void *output_ptr,
void const *filter_ptr,
void const *bias_ptr,
float const *input_ptr,
float *output_ptr,
float const *filter_ptr,
float const *bias_ptr,
int in_dim,
int out_dim,
int batch_size);
void backward_kernel(ffStream_t stream,
LinearPerDeviceState const &m,
void const *input_ptr,
void *input_grad_ptr,
void const *output_ptr,
void *output_grad_ptr,
void const *kernel_ptr,
void *kernel_grad_ptr,
void *bias_ptr,
float const *input_ptr,
float *input_grad_ptr,
float const *output_ptr,
float *output_grad_ptr,
float const *kernel_ptr,
float *kernel_grad_ptr,
float *bias_ptr,
int in_dim,
int out_dim,
int batch_size);
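Switching the linear kernel signatures from void * to float * matches the accessor API used elsewhere in this PR (get_float_ptr()) and removes casts at call sites. A hypothetical call-site sketch, assuming read accessors for input/filter/bias and a write accessor for output have already been obtained and that the call is made from within the kernels' namespace:

// Hypothetical call site; the accessor and state variables are assumed to exist
// in the surrounding task. With float-typed parameters there is no void* cast.
forward_kernel(stream,
               per_device_state,
               input.get_float_ptr(),  // float const *
               output.get_float_ptr(), // float *
               filter.get_float_ptr(), // float const *
               bias.get_float_ptr(),   // float const *
               in_dim,
               out_dim,
               batch_size);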
163 changes: 80 additions & 83 deletions lib/kernels/src/cuda/ops/gather_kernels.cu
@@ -25,10 +25,10 @@ template <typename IndexType>
__global__ void gather_forward(float const *input,
IndexType const *index,
float *output,
size_t output_size,
size_t stride,
size_t input_dim_size,
size_t output_dim_size) {
coord_t output_size,
coord_t stride,
coord_t input_dim_size,
coord_t output_dim_size) {
CUDA_KERNEL_LOOP(o, output_size) {
// output tensor shape: [*, output_dim_size, stride]
// output tensor stride: [output_dim_size * stride, stride, 1]
@@ -39,10 +39,10 @@ __global__ void gather_forward(float const *input,
// [outer_index, index[0], left_over]
// Therefore, input_index = outer_index * (stride * input_dim_size)
// + index[0] * stride + left_over;
size_t outer_index = o / (stride * output_dim_size);
coord_t outer_index = o / (stride * output_dim_size);
// coord_t index_2 = (o / stride) % dim_size
size_t left_over = o % stride;
size_t input_idx =
coord_t left_over = o % stride;
coord_t input_idx =
outer_index * (stride * input_dim_size) + index[o] * stride + left_over;
output[o] = input[input_idx];
}
@@ -52,10 +52,10 @@ template <typename IndexType>
__global__ void gather_backward(float const *output_grad,
IndexType const *index,
float *input_grad,
size_t output_size,
size_t stride,
size_t input_dim_size,
size_t output_dim_size) {
coord_t output_size,
coord_t stride,
coord_t input_dim_size,
coord_t output_dim_size) {
CUDA_KERNEL_LOOP(o, output_size) {
// output tensor shape: [*, output_dim_size, stride]
// output tensor stride: [output_dim_size * stride, stride, 1]
@@ -66,10 +66,10 @@ __global__ void gather_backward(float const *output_grad,
// [outer_index, index[0], left_over]
// Therefore, input_index = outer_index * (stride * input_dim_size)
// + index[0] * stride + left_over;
size_t outer_index = o / (stride * output_dim_size);
coord_t outer_index = o / (stride * output_dim_size);
// coord_t index_2 = (o / stride) % dim_size
size_t left_over = o % stride;
size_t input_idx =
coord_t left_over = o % stride;
coord_t input_idx =
outer_index * (stride * input_dim_size) + index[o] * stride + left_over;

atomicAdd(&input_grad[input_idx], output_grad[o]);
@@ -78,100 +78,97 @@

template <DataType IndexType>
struct ForwardKernel {
void operator()(cudaStream_t stream,
GatherPerDeviceState const &m,
void operator()(ffStream_t stream,
GenericTensorAccessorR const &input,
GenericTensorAccessorR const &index,
GenericTensorAccessorW const &output,
size_t stride,
size_t input_dim_size,
size_t output_dim_size) {
/*size_t stride = 1;
for (int i = 0; i < m->legion_dim; i++) {
stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1);
}
size_t dim_size =
output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] +
1;
*/
gather_forward<real_type<IndexType>>
<<<GET_BLOCKS(output.shape.get_volume()),
CUDA_NUM_THREADS,
0,
stream>>>(input.get<DataType::FLOAT>(),
index.get<IndexType>(),
output.get<DataType::FLOAT>(),
output.shape.get_volume(),
stride,
input_dim_size,
output_dim_size);
coord_t output_size,
coord_t stride,
coord_t input_dim_size,
coord_t output_dim_size) {
gather_forward<<<GET_BLOCKS(output_size), CUDA_NUM_THREADS, 0, stream>>>(
input.get_float_ptr(),
index.get<IndexType>(),
output.get_float_ptr(),
output_size,
stride,
input_dim_size,
output_dim_size);
}
};

void forward_kernel(cudaStream_t stream,
template <DataType IndexType>
struct BackwardKernel {
void operator()(ffStream_t stream,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorR const &index,
GenericTensorAccessorW const &input_grad,
coord_t output_size,
coord_t stride,
coord_t input_dim_size,
coord_t output_dim_size) {
gather_backward<<<GET_BLOCKS(output_size), CUDA_NUM_THREADS, 0, stream>>>(
output_grad.get_float_ptr(),
index.get<IndexType>(),
input_grad.get_float_ptr(),
output_size,
stride,
input_dim_size,
output_dim_size);
}
};

void forward_kernel(ffStream_t stream,
GatherPerDeviceState const &m,
GenericTensorAccessorR const &input,
GenericTensorAccessorR const &index,
GenericTensorAccessorW const &output,
size_t stride,
size_t input_dim_size,
size_t output_dim_size) {
DataTypeDispatch1<ForwardKernel>{}(m.index_data_type,
GenericTensorAccessorW const &output) {
checkCUDA(get_legion_stream(&stream));

coord_t stride =
output.shape
.sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1})
.get_volume();
coord_t output_dim_size = output.shape[m.legion_dim];
coord_t input_dim_size = input.shape[m.legion_dim];

assert(index.data_type == DataType::INT32 ||
index.data_type == DataType::INT64);

DataTypeDispatch1<ForwardKernel>{}(index.data_type,
stream,
m,
input,
index,
output,
output.shape.get_volume(),
stride,
input_dim_size,
output_dim_size);
}

template <DataType IndexType>
struct BackwardKernel {
void operator()(cudaStream_t stream,
GatherPerDeviceState const &m,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorR const &index,
GenericTensorAccessorW const &input_grad,
size_t stride,
size_t input_dim_size,
size_t output_dim_size) {
/*size_t stride = 1;
for (int i = 0; i < m->legion_dim; i++) {
stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1);
}
size_t dim_size = output_grad.domain.hi()[m->legion_dim] -
output_grad.domain.lo()[m->legion_dim] + 1;
*/
gather_backward<real_type<IndexType>>
<<<GET_BLOCKS(output_grad.shape.get_volume()),
CUDA_NUM_THREADS,
0,
stream>>>(output_grad.get<DataType::FLOAT>(),
index.get<IndexType>(),
input_grad.get<DataType::FLOAT>(),
output_grad.shape.get_volume(),
stride,
input_dim_size,
output_dim_size);
}
};

void backward_kernel(cudaStream_t stream,
void backward_kernel(ffStream_t stream,
GatherPerDeviceState const &m,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorR const &index,
GenericTensorAccessorW const &input_grad,
size_t stride,
size_t input_dim_size,
size_t output_dim_size) {
DataTypeDispatch1<BackwardKernel>{}(m.index_data_type,
GenericTensorAccessorW const &input_grad) {
checkCUDA(get_legion_stream(&stream));

coord_t stride =
output_grad.shape
.sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1})
.get_volume();
coord_t output_dim_size = output_grad.shape[m.legion_dim];
coord_t input_dim_size = input_grad.shape[m.legion_dim];

assert(index.data_type == DataType::INT32 ||
index.data_type == DataType::INT64);

DataTypeDispatch1<BackwardKernel>{}(index.data_type,
stream,
m,
output_grad,
index,
input_grad,
output_grad.shape.get_volume(),
stride,
input_dim_size,
output_dim_size);
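The index arithmetic that gather_forward and gather_backward share can be checked on the CPU. Below is a small standalone sketch in plain C++ with hypothetical shape values; it mirrors the kernels' decomposition of a flat output index into [outer_index, position along the gathered dimension, left_over]:

#include <cassert>
#include <cstddef>
#include <vector>

using coord_t = long long; // stand-in for the coord_t type used by the kernels

int main() {
  // Hypothetical shapes: input is [outer=2, input_dim_size=5, stride=3] and
  // output is [outer=2, output_dim_size=4, stride=3], gathering along the middle
  // dimension (stride is the volume of the dimensions below it).
  coord_t stride = 3, input_dim_size = 5, output_dim_size = 4, outer = 2;
  coord_t output_size = outer * output_dim_size * stride;

  // One index entry per output element, each in [0, input_dim_size).
  std::vector<coord_t> index(output_size, 2);
  std::vector<float> input(outer * input_dim_size * stride);
  for (std::size_t i = 0; i < input.size(); ++i) {
    input[i] = static_cast<float>(i);
  }
  std::vector<float> output(output_size, 0.0f);

  for (coord_t o = 0; o < output_size; ++o) {
    // Same decomposition as the CUDA kernels above.
    coord_t outer_index = o / (stride * output_dim_size);
    coord_t left_over = o % stride;
    coord_t input_idx =
        outer_index * (stride * input_dim_size) + index[o] * stride + left_over;
    output[o] = input[input_idx];
  }

  // Output element 0 sits in outer block 0 at offset 0 and gathers row index[0]=2,
  // so it should read input[2 * stride + 0] = input[6].
  assert(output[0] == input[6]);
  return 0;
}

The backward kernel walks the same mapping in reverse, accumulating output_grad[o] into input_grad[input_idx] with atomicAdd, since several output elements can gather from the same input location.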