diff --git a/csrc/include/opus/opus.hpp b/csrc/include/opus/opus.hpp index b0fa54d9e5..bab041bd2d 100644 --- a/csrc/include/opus/opus.hpp +++ b/csrc/include/opus/opus.hpp @@ -33,7 +33,7 @@ #endif #ifndef OPUS_TILE_CONTAINER -#define OPUS_TILE_CONTAINER 0 // 0:ext-vector 1:array +#define OPUS_TILE_CONTAINER 0 // 0:vector, 1:array of vector, 2:flattened array #endif namespace opus { @@ -153,10 +153,18 @@ template struct __make_index_seq >::seq_type>::seq_type; }; } // namespace impl - // make_index_seq<5> -> seq<0,1,2,3,4> | make_index_seq<4, 9> -> seq<4,5,6,7,8> | make_index_seq<4, 8, 2> -> seq<4, 6> template using make_index_seq = typename impl::__make_index_seq>::seq_type; +namespace impl { +template +struct __make_repeated_seq { + template static constexpr auto __make(seq) { return seq<(void(I), Value)...>{}; } + using seq_type = decltype(__make(make_index_seq{})); +}; +} // namespace impl +template using make_repeated_seq = typename impl::__make_repeated_seq::seq_type; + template OPUS_H_D constexpr auto concat_seq(seq, seq) { return seq{}; } namespace impl { @@ -212,10 +220,10 @@ template struct tuple; template OPUS_H_D constexpr void static_ford(tuple...>, F f) { impl::static_ford_impl>{}(f); } ///////////////////////////////////////////////////////////////////////////////////////////////////////// -// array, enhanced C like array style. 
convenient for cases like assign one array to another +// array, enhanced C like array style template struct array { - using value_type = remove_cv_t; + using value_type = remove_cvref_t; using type = array; #if 0 // don't define following, just let me be trivially copyable class OPUS_H_D constexpr array() = default; @@ -235,7 +243,9 @@ struct array { OPUS_H_D static constexpr bool empty() { return size() == 0; } OPUS_H_D static constexpr index_t size() { return N; } - value_type content[N]; + // we need this "content" member to have a default value, so that the implicitly defined constructor could be constexpr + // see: https://en.cppreference.com/w/cpp/language/constexpr.html#constexpr_constructor + value_type content[N] {}; }; template @@ -348,6 +358,12 @@ OPUS_H_D constexpr decltype(auto) get(T&& t) { return get(get(std template OPUS_H_D constexpr auto make_tuple(T&&... xs) { return tuple...>(std::forward(xs)...); } +namespace impl { +template OPUS_H_D constexpr auto make_repeated_tuple(T&& x, seq) { return opus::make_tuple((void(Is), std::forward(x))...); } +} // namespace impl +template OPUS_H_D constexpr auto make_repeated_tuple(T&& x) { return impl::make_repeated_tuple(std::forward(x), make_index_seq{}); } +template OPUS_H_D constexpr auto make_repeated_tuple(T&& x, number) { return impl::make_repeated_tuple(std::forward(x), make_index_seq{}); } + namespace impl { template OPUS_H_D constexpr auto concat_tuple(T0 const& t0, T1 const& t1, seq, seq) { return opus::make_tuple(get(t0)..., get(t1)...); } @@ -374,35 +390,6 @@ template static constexpr bool is_tuple_v = is_tuple OPUS_H_D constexpr std::enable_if_t, index_t> size(T&&) { return remove_cvref_t::size(); /* tuple size */} template OPUS_H_D constexpr std::enable_if_t, index_t> size() { return remove_cvref_t::size(); /* tuple size */} -namespace impl { -template struct to_peepholed_seq; - -template struct to_peepholed_seq, max_income_num> { - template OPUS_H_D constexpr auto operator()(number) { - 
constexpr auto next_cumulative = std::conditional_t(PeepholedTuple{}))>>, - number<(C+1) < max_income_num::value ? (C+1) : C>, number>{}; - return concat_seq(seq{}, to_peepholed_seq, max_income_num>{}(next_cumulative) ); - } -}; -template struct to_peepholed_seq, max_income_num> { - template OPUS_H_D constexpr auto operator()(number) { return seq{}; } -}; - -template -OPUS_H_D constexpr decltype(auto) merge_peepholed_tuple_impl(PeepholedTuple&& pt, IncomTuple&& it, seq, seq) { - return opus::make_tuple([&](){ if constexpr (is_underscore_v(pt))>>) return get(it); - else return get(pt);}()... ); -} -} -// (Peepholed)tuple<*, *, _, *, _> + (Income)tuple<#, @> -> tuple<*, *, #, *, @>. "_"(underscore) indicate a peephole for income tuple to chime in -template -OPUS_H_D constexpr decltype(auto) merge_peepholed_tuple(PeepholedTuple&& pt, IncomeTuple&& it) { - constexpr auto income_seq = impl::to_peepholed_seq< remove_cvref_t, - make_index_seq()>, - number()> >{}(number<0>{}); - return impl::merge_peepholed_tuple_impl(std::forward(pt), std::forward(it), make_index_seq()>{}, income_seq); -} - template , bool> = true> OPUS_H_D constexpr auto explode_tuple(const T& t) { return opus::make_tuple(t); } template OPUS_H_D constexpr auto explode_tuple(const T&, seq); template , bool> = true> OPUS_H_D constexpr auto explode_tuple(const T& t) { return explode_tuple(t, make_index_seq()>{}); } @@ -416,7 +403,7 @@ template OPUS_H_D constexpr auto embed_nested_tuple_impl(const Outer& ot, const Inner& it, seq) { return opus::make_tuple(concat_tuple(get(ot), get(it))...); } template -OPUS_H_D constexpr auto tuple_count_impl(const T& t, seq) { return (number(t))>, remove_cvref_t> ? 1 : 0>{} + ...); } +OPUS_H_D constexpr auto tuple_count_impl(seq) { return (number(T{}))>, remove_cvref_t> ? 
1 : 0>{} + ...); } } // Outer: tuple, tuple>, Inner: tuple, tuple> => tuple, tuple> template @@ -425,8 +412,11 @@ OPUS_H_D constexpr auto embed_nested_tuple(const Outer& ot, const Inner& it) { return impl::embed_nested_tuple_impl(ot, it, make_index_seq()>{}); } -template< typename TargetType, typename T> -OPUS_H_D constexpr index_t tuple_count(const T& t) { return impl::tuple_count_impl(t, make_index_seq()>{}).value; } +template< typename TargetType, typename T, std::enable_if_t, bool> = true> +OPUS_H_D constexpr index_t tuple_count(const T& t) { return impl::tuple_count_impl>(make_index_seq()>{}).value; } + +template< typename TargetType, typename T, std::enable_if_t, bool> = true> +OPUS_H_D constexpr index_t tuple_count() { return impl::tuple_count_impl>(make_index_seq()>{}).value; } template OPUS_H_D constexpr auto seq_to_tuple(seq) { return opus::make_tuple(number{}...); } @@ -447,17 +437,55 @@ OPUS_H_D constexpr auto reduce_tuple(const T & t) { return impl::reduce_tuple_i template, bool> = true> OPUS_H_D constexpr auto reduce_tuple_sum(const T & t) { return reduce_tuple(t); } template, bool> = true> OPUS_H_D constexpr auto reduce_tuple_mul(const T & t) { return reduce_tuple(t); } +namespace impl { +template struct to_peepholed_seq; + +template struct to_peepholed_seq, max_income_num> { + template OPUS_H_D constexpr auto operator()(number) { + constexpr auto next_cumulative = std::conditional_t(PeepholedTuple{}))>>, + number<(C+1) < max_income_num::value ? (C+1) : C>, number>{}; + return concat_seq(seq{}, to_peepholed_seq, max_income_num>{}(next_cumulative) ); + } +}; +template struct to_peepholed_seq, max_income_num> { + template OPUS_H_D constexpr auto operator()(number) { return seq{}; } +}; + +template +OPUS_H_D constexpr decltype(auto) merge_peepholed_tuple_impl(PeepholedTuple&& pt, IncomTuple&& it, seq, seq) { + return opus::make_tuple([&](){ if constexpr (is_underscore_v(pt))>>) return get(it); + else return get(pt);}()... 
); +} +} +// (Peepholed)tuple<*, *, _, *, _> + (Income)tuple<#, @> -> tuple<*, *, #, *, @>. "_"(underscore) indicate a peephole for income tuple to chime in +template +OPUS_H_D constexpr decltype(auto) merge_peepholed_tuple(PeepholedTuple&& pt, IncomeTuple&& it) { + if constexpr (tuple_count() == 0) return pt; + else { + constexpr auto income_seq = impl::to_peepholed_seq< remove_cvref_t, make_index_seq()>, + number()> >{}(number<0>{}); + return impl::merge_peepholed_tuple_impl(std::forward(pt), std::forward(it), make_index_seq()>{}, income_seq); + } +} +} // namespace opus + +// implementing the "tuple-like binding protocol", don't use below directly +namespace std { +template struct tuple_size> : std::integral_constant {}; +template struct tuple_size> : std::integral_constant {}; +template struct tuple_element> : std::tuple_element> {}; +template struct tuple_element> : std::tuple_element> {}; +} // namespace std + +namespace opus { ///////////////////////////////////////////////////////////////////////////////////////////////////////// // transforms template constexpr auto embed(const X& x, const Y& y, seq) { return ( ... 
+ (get(x) * get(y))); } template constexpr auto embed(const X& x, const Y& y) { return embed(x, y, make_index_seq{}); } namespace impl { -template -OPUS_H_D constexpr auto transform_tuple_impl(F f, const X& x, seq) { return opus::make_tuple(f(get(x))...); } - -template -OPUS_H_D constexpr auto transform_tuple_with_idx_impl(F f, const X& x, seq) { return opus::make_tuple(f(get(x), number{})...); } +template OPUS_H_D constexpr auto transform_tuple_impl(F f, const X& x, seq) { return opus::make_tuple(f(get(x))...); } +template OPUS_H_D constexpr auto transform_tuple_with_idx_impl(F f, const X& x, seq) { return opus::make_tuple(f(get(x), number{})...); } } // namespace impl // f(auto item) template OPUS_H_D constexpr auto transform_tuple(F f, const X& x) { return impl::transform_tuple_impl(f, x, make_index_seq()>{}); } @@ -500,8 +528,8 @@ struct layout : public tuple, remove_cvref_t, re else return rank - tuple_count(Coord{}); }(); - OPUS_H_D constexpr layout(const Shape& shape, const Stride& stride, const Coord& coord = {}) : base(shape, stride, coord), linear_offset(0){} - OPUS_H_D constexpr layout(Shape&& shape, Stride&& stride, Coord&& coord = {}) : base(shape, stride, coord), linear_offset(0){} + OPUS_H_D constexpr layout(const Shape& shape, const Stride& stride, const Coord& coord = {}) : base(shape, stride, coord){} + OPUS_H_D constexpr layout(Shape&& shape, Stride&& stride, Coord&& coord = {}) : base(shape, stride, coord){} // get ith element from shape/stride. 
if no I, then get the shape/stride as tuple template OPUS_H_D constexpr decltype(auto) shape() { return get<0,I...>(static_cast(*this)); } @@ -516,29 +544,129 @@ struct layout : public tuple, remove_cvref_t, re template , bool> = true> OPUS_H_D constexpr decltype(auto) operator()(InCoord&& c) const { - if constexpr (std::is_same_v) return linear_offset + coord_to_linear(*this, c); - else return linear_offset + coord_to_linear(*this, merge_peepholed_tuple(coord(), c)); - } + if constexpr (std::is_same_v) return coord_to_linear(*this, c); + else return coord_to_linear(*this, merge_peepholed_tuple(coord(), c)); } +}; + +template struct layout_linear; +template struct layout_cached; + +// use cached_vec to dispatch which layout implementation. cached_vec < 0 : "layout", cached_vec == 0 : "layout_linear", cached_vec > 0 : "layout_cached" +template OPUS_H_D constexpr auto make_layout(Sx&& s, Sy&& t) { + if constexpr (cached_vec < 0) return layout(std::forward(s), std::forward(t)); + else if constexpr (cached_vec == 0) return layout_linear>(std::forward(s), std::forward(t)); + else return layout_cached>(std::forward(s), std::forward(t)); } +template +OPUS_H_D constexpr auto make_layout(Sx&& s, Sy&& t, Sz&& c) { + if constexpr (cached_vec < 0) return layout(std::forward(s), std::forward(t), std::forward(c)); + if constexpr (cached_vec == 0) return layout_linear>(std::forward(s), std::forward(t), std::forward(c)); + else return layout_cached>(std::forward(s), std::forward(t), std::forward(c)); } +template && ...), bool> = true> +OPUS_H_D constexpr auto make_layout(Ts&&... 
ss) { return make_layout(opus::make_tuple(ss...), packed_shape_to_stride(opus::make_tuple(ss...))); } +template OPUS_H_D constexpr auto make_layout(S&& s) { return make_layout(std::forward(s), packed_shape_to_stride(s)); } + +template OPUS_H_D constexpr auto make_layout_packed(S&& s) { return make_layout(std::forward(s), packed_shape_to_stride(s)); } // same as single arg make_layout +template OPUS_H_D constexpr auto make_layout_packed(Sx&& s, Sz&& c) { return make_layout(std::forward(s), packed_shape_to_stride(s), std::forward(c)); } + +template +struct layout_linear : public remove_cvref_t{ + using base = remove_cvref_t; + + template + OPUS_H_D constexpr layout_linear(const Shape& shape, const Stride& stride, const Coord& coord = {}) : base(shape, stride, coord), linear_offset(0){} + + template + OPUS_H_D constexpr layout_linear(Shape&& shape, Stride&& stride, Coord&& coord = {}) : base(shape, stride, coord), linear_offset(0){} + + template && ...), bool> = true> + OPUS_H_D constexpr decltype(auto) operator()(Cs&&... 
cs) const { return this->operator()(opus::make_tuple(std::forward(cs)...)); } + + template , bool> = true> + OPUS_H_D constexpr decltype(auto) operator()(InCoord&& c) const { + if constexpr (std::is_same_v) return linear_offset + coord_to_linear(*this, c); + else return linear_offset + coord_to_linear(*this, merge_peepholed_tuple(base::coord(), c)); } OPUS_H_D constexpr void inc(index_t offset) { linear_offset += offset; } - OPUS_H_D constexpr layout& operator+=(index_t offset) { inc(offset); return *this; } + OPUS_H_D constexpr layout_linear& operator+=(index_t offset) { inc(offset); return *this; } index_t linear_offset; }; +template OPUS_H_D constexpr auto layout_to_vectorized_issue_space(); +template OPUS_H_D constexpr auto layout_to_offsets(const Layout& u); + +template +struct layout_cached : public remove_cvref_t { + using base = remove_cvref_t; + static constexpr index_t cached_vec = cached_vec_; + + static constexpr auto issue_space_vec = layout_to_vectorized_issue_space(); + static constexpr index_t num_issues = get<0>(reduce_tuple_mul(issue_space_vec)).value; + + template + OPUS_H_D constexpr layout_cached(const Shape& shape, const Stride& stride, const Coord& coord = {}) : base(shape, stride, coord), offsets{layout_to_offsets(static_cast(*this))}{} + + template + OPUS_H_D constexpr layout_cached(Shape&& shape, Stride&& stride, Coord&& coord = {}) : base(shape, stride, coord), offsets{layout_to_offsets(static_cast(*this))}{} + + template && ...), bool> = true> + OPUS_H_D constexpr decltype(auto) operator()(Cs&&... 
cs) const { return this->operator()(opus::make_tuple(std::forward(cs)...)); } + + template , bool> = true> + OPUS_H_D constexpr decltype(auto) operator()(InCoord&& c) const { constexpr auto u_linear = make_layout<-1>(issue_space_vec); return offsets[u_linear(c)]; } + + OPUS_H_D constexpr void inc(index_t offset) { static_for([&](auto i){ offsets[i] += offset; }); } + OPUS_H_D constexpr layout_cached& operator+=(index_t offset) { inc(offset); return *this; } + + array offsets; +}; + template struct is_layout : false_type {}; template struct is_layout> : true_type {}; +template struct is_layout> : true_type {}; +template struct is_layout> : true_type {}; template constexpr bool is_layout_v = is_layout>::value; -template OPUS_H_D constexpr auto make_layout(Sx&& s, Sy&& t) { return layout(std::forward(s), std::forward(t)); } -template -OPUS_H_D constexpr auto make_layout(Sx&& s, Sy&& t, Sz&& c) { return layout(std::forward(s), std::forward(t), std::forward(c)); } -template && ...), bool> = true> -OPUS_H_D constexpr auto make_layout(Ts&&... 
ss) { return make_layout(opus::make_tuple(ss...), packed_shape_to_stride(opus::make_tuple(ss...))); } -template OPUS_H_D constexpr auto make_layout(S&& s) { return make_layout(std::forward(s), packed_shape_to_stride(s)); } +template +OPUS_H_D constexpr auto layout_to_issue_space() { + using maybe_coord = std::conditional_t, typename Layout::Shape, typename Layout::Coord>; + using issue_space_y = remove_cvref_t; + using single_issue_space = remove_cvref_t{}, number()>{}))>; + using fallback_issue_space_y = std::conditional_t>, single_issue_space, issue_space_y>; + using issue_space = std::conditional_t, single_issue_space, fallback_issue_space_y>; + return issue_space{}; +} + +template +OPUS_H_D constexpr auto vectorize_issue_space(issue_space, number = {}) { + constexpr index_t vec_from_issue_space = get() - 1>(issue_space{}).value; // here we get the original last dim length(which should be y dim) + static_assert(vec_from_issue_space % vec == 0, "please make sure requested vec size can be dividable of vec from issue space"); -template OPUS_H_D constexpr auto make_layout_packed(S&& s) { return make_layout(std::forward(s), packed_shape_to_stride(s)); } // same as single arg make_layout -template OPUS_H_D constexpr auto make_layout_packed(Sx&& s, Sz&& c) { return make_layout(std::forward(s), packed_shape_to_stride(s), std::forward(c)); } + constexpr auto issue_space_vec = transform_tuple_with_idx([&](auto item, auto index){ // modify the last dim, divide it by vec. Result is still a tuple + if constexpr (index.value == size() - 1) return number{}; + else return item; }, issue_space{}); + return issue_space_vec; +} + +template +OPUS_H_D constexpr auto layout_to_vectorized_issue_space() { + constexpr auto issue_space = layout_to_issue_space(); + constexpr auto issue_space_vec = vectorize_issue_space(issue_space, number{}); + static_assert(size() == Layout::coord_rank); + return issue_space_vec; +} + +// this function is usually not constexpr. 
pre-compute all the offset under current layout +template +OPUS_H_D constexpr auto layout_to_offsets(const Layout& u) { + constexpr auto issue_space_vec = layout_to_vectorized_issue_space(); + constexpr index_t num_issues = get<0>(reduce_tuple_mul(issue_space_vec)).value; + array offsets; + + constexpr auto u_linear = make_layout<-1>(issue_space_vec); + static_ford(issue_space_vec, [&](auto ... ids){ offsets[u_linear(ids...)] = u(ids...); }); + return offsets; +} ///////////////////////////////////////////////////////////////////////////////////////////////////////// // vector, a wrapper for __attribute__((ext_vector_type(*))) @@ -578,6 +706,12 @@ template using vector_return_type = opus::vector_ } template constexpr impl::vector_return_type make_vector(Types&&... t) { return {std::forward(t)...}; } +namespace impl { +template OPUS_H_D constexpr auto make_repeated_vector(T&& x, seq) { return opus::make_vector((void(Is), std::forward(x))...); } +} // namespace impl +template OPUS_H_D constexpr auto make_repeated_vector(T&& x) { return impl::make_repeated_vector(std::forward(x), make_index_seq{}); } +template OPUS_H_D constexpr auto make_repeated_vector(T&& x, number) { return impl::make_repeated_vector(std::forward(x), make_index_seq{}); } + // vector type can't return reference! 
error: non-const reference cannot bind to vector element template , bool> = true> OPUS_H_D constexpr typename vector_traits::dtype get(T const& t) { static_assert(I < vector_traits::size()); return t[I]; } template , bool> = true> OPUS_H_D constexpr typename vector_traits::dtype get(T&& t) { static_assert(I < vector_traits::size()); return t[I]; } @@ -630,19 +764,15 @@ OPUS_H_D constexpr auto to_vector(const T& t) { return impl::to_vector_impl(t, m ///////////////////////////////////////////////////////////////////////////////////////////////////////// // slice namespace impl { -template, bool> = true> OPUS_H_D constexpr auto slice_impl(C&& container, seq) { return opus::make_vector(get(container)...); } -template, bool> = true> OPUS_H_D constexpr auto slice_impl(C&& container, seq) { return opus::make_array(get(container)...); } -template, bool> = true> OPUS_H_D constexpr auto slice_impl(C&& container, seq) { return opus::make_tuple(get(container)...); } +template, bool> = true> OPUS_H_D constexpr auto slice_impl(C&& c, seq) { return opus::make_vector(get(c)...); } +template, bool> = true> OPUS_H_D constexpr auto slice_impl(C&& c, seq) { return opus::make_array(get(c)...); } +template, bool> = true> OPUS_H_D constexpr auto slice_impl(C&& c, seq) { return opus::make_tuple(get(c)...); } template, bool> = true> -OPUS_H_D constexpr auto slice_impl_i(C&& container, Ts... ss) { - vector_t::dtype, len> r; index_t d = 0; static_for([&](auto i){r[d++] = container[i]; }, ss...); return r; -} +OPUS_H_D constexpr auto slice_impl_i(C&& c, Ts... ss) { vector_t::dtype, len> r; index_t d = 0; static_for([&](auto i){r[d++] = c[i]; }, ss...); return r; } template, bool> = true> -OPUS_H_D constexpr auto slice_impl_i(C&& container, Ts... ss) { - array r; index_t d = 0; static_for([&](auto i){r[d++] = container[i]; }, ss...); return r; -} +OPUS_H_D constexpr auto slice_impl_i(C&& c, Ts... 
ss) { array r; index_t d = 0; static_for([&](auto i){r[d++] = c[i]; }, ss...); return r; } template || is_array_v || is_tuple_v), bool> = true> OPUS_H_D constexpr auto set_slice_impl(C&& dst_c, V&& src_c, seq, seq) { (( dst_c[Ds] = src_c[Ss]), ...); } @@ -651,19 +781,19 @@ OPUS_H_D constexpr auto set_slice_impl(C&& dst_c, V&& src_c, seq, seq, or const integer. Note tuple type does not support dynamic slice (ss is integral) // (1).[end] : 0.... end, (2).[start, end] : start...end, (3).[start, end, step], start...end but with step as interval (default is 1) template && (is_constant_v && ...), bool> = true> -OPUS_H_D constexpr auto slice(C&& container, S&&...ss) { return impl::slice_impl(std::forward(container), make_index_seq<(S::value) ...>{}); } +OPUS_H_D constexpr auto slice(C&& c, S&&...ss) { return impl::slice_impl(std::forward(c), make_index_seq<(S::value) ...>{}); } template && (std::is_integral_v && ...), bool> = true> -OPUS_H_D constexpr auto slice(C&& container, S&&...ss) { return impl::slice_impl_i(std::forward(container), ss...); } +OPUS_H_D constexpr auto slice(C&& c, S&&...ss) { return impl::slice_impl_i(std::forward(c), ss...); } template && (is_constant_v && ...), bool> = true> -OPUS_H_D constexpr auto slice(C&& container, S&&...ss) { return impl::slice_impl(std::forward(container), make_index_seq<(S::value) ...>{}); } +OPUS_H_D constexpr auto slice(C&& c, S&&...ss) { return impl::slice_impl(std::forward(c), make_index_seq<(S::value) ...>{}); } template && (std::is_integral_v && ...), bool> = true> -OPUS_H_D constexpr auto slice(C&& container, S&&...ss) { return impl::slice_impl_i(std::forward(container), ss...); } +OPUS_H_D constexpr auto slice(C&& c, S&&...ss) { return impl::slice_impl_i(std::forward(c), ss...); } template && (is_constant_v && ...), bool> = true> -OPUS_H_D constexpr auto slice(C&& container, S&&...ss) { return impl::slice_impl(std::forward(container), make_index_seq<(S::value) ...>{}); } +OPUS_H_D constexpr auto slice(C&& c, 
S&&...ss) { return impl::slice_impl(std::forward(c), make_index_seq<(S::value) ...>{}); } template || is_array_v || is_tuple_v) && (is_constant_v && ...), bool> = true> OPUS_H_D constexpr auto set_slice(C&& dst_c, V&& src_c, S&&...ss) { @@ -701,6 +831,8 @@ REGISTER_DTYPE(i16 , int16_t) REGISTER_DTYPE(i8 , int8_t) REGISTER_DTYPE(u8 , uint8_t) +template && (is_constant_v && ...), bool> = true> +OPUS_H_D constexpr auto slice(C&& container, S&&...ss) { return container; } // TODO: fallback slice a normal value does nothing ///////////////////////////////////////////////////////////////////////////////////////////////////////// // type cast OPUS_D bf16_t fp32_to_bf16_rtn_asm(const float& x) { @@ -827,31 +959,25 @@ struct gmem { template || is_dtype_v || is_array_v), bool> = true> // os in unit of T and cast to vector with vec OPUS_D void store(const V& x, int v_os, int s_os = 0, number = {}) { static_assert(std::is_same_v::dtype, scalar_type>, "scalar type must be same for the data to be stored" ); - static_assert((vec * vector_size) == vector_traits::size(), "vector size need to be same, please check" ); - _store(x, v_os * sizeof(T), s_os * sizeof(T), number{}); + if constexpr (is_dtype_v && (vec * vector_size) % vector_traits::size() == 0) { + _store(make_repeated_vector(x, number::size()>{}), v_os * sizeof(T)); + } else { + static_assert((vec * vector_size) == vector_traits::size(), "vector size need to be same, please check" ); + _store(x, v_os * sizeof(T)); + } } // bulk load API, give me a Shape of this tile, will issue multiple load instruction based on the y-shape space template, bool> = true> OPUS_D auto load(const Layout& u, int s_os = 0/* do we really need this? 
*/, number = {}) { - using maybe_coord = std::conditional_t, typename Layout::Shape, typename Layout::Coord>; - constexpr auto issue_space_y = pickup_shape(typename Layout::Shape{}, maybe_coord{}, underscore{}); - using issue_space = std::conditional_t, typename Layout::Shape, remove_cvref_t>; - - constexpr index_t vec_from_issue_space = get() - 1>(issue_space{}).value; // here we get the original last dim length(which should be y dim) - static_assert(vec_from_issue_space % vec == 0, "please make sure requested vec size can be dividable of vec from issue space"); - - constexpr auto issue_space_vec = transform_tuple_with_idx([&](auto item, auto index){ // modify the last dim, divide it by vec. Result is still a tuple - if constexpr (index.value == size() - 1) return number{}; - else return item; }, issue_space{}); - - static_assert(size() == Layout::coord_rank); - constexpr index_t r_elem = [&](){ index_t n = 1; static_for()>([&](auto i){ n *= get(issue_space_vec); }); return n; }(); + constexpr auto issue_space = layout_to_issue_space(); + constexpr auto issue_space_vec = vectorize_issue_space(issue_space, number{}); + constexpr auto r_elem = get<0>(reduce_tuple_mul(issue_space_vec)); #if OPUS_TILE_CONTAINER == 0 - constexpr auto u_r = make_layout(issue_space{}); // we use this layout to describe the register layout - vector_t r; // local scratch to host the loaded register, and return it + constexpr auto u_r = make_layout<-1>(issue_space); // we use this layout to describe the register layout + vector_t r; // local scratch to host the loaded register, and return it static_ford(issue_space_vec, [&](auto ... 
ids){ auto tmp = load(u(ids...), s_os, number{}); constexpr index_t u_rs = u_r(ids...); @@ -859,8 +985,8 @@ struct gmem { }); return r; #elif OPUS_TILE_CONTAINER == 1 - constexpr auto u_r = make_layout(issue_space_vec); // we use this layout to describe the register layout - array, r_elem> r; // local scratch to host the loaded register, and return it + constexpr auto u_r = make_layout<-1>(issue_space_vec); // we use this layout to describe the register layout + array, r_elem.value> r; // local scratch to host the loaded register, and return it static_ford(issue_space_vec, [&](auto ... ids){ r[u_r(ids...)] = load(u(ids...), s_os, number{}); }); // issue the loading instruction multiple times return r; #endif @@ -869,22 +995,14 @@ struct gmem { template || is_vector_v) && is_layout_v), bool> = true> OPUS_D void store(const V& x, const Layout& u, int s_os = 0/* do we really need this? */, number = {}) { - using maybe_coord = std::conditional_t, typename Layout::Shape, typename Layout::Coord>; - constexpr auto issue_space_y = pickup_shape(typename Layout::Shape{}, maybe_coord{}, underscore{}); - using issue_space = std::conditional_t, typename Layout::Shape, remove_cvref_t>; - - constexpr index_t vec_from_issue_space = get() - 1>(issue_space{}).value; // here we get the original last dim length(which should be y dim) - static_assert(vec_from_issue_space % vec == 0, "please make sure requested vec size can be dividable of vec from issue space"); + constexpr auto issue_space = layout_to_issue_space(); + constexpr auto issue_space_vec = vectorize_issue_space(issue_space, number{}); - constexpr auto issue_space_vec = transform_tuple_with_idx([&](auto item, auto index){ // modify the last dim, divide it by vec. 
Result is still a tuple - if constexpr (index.value == size() - 1) return number{}; - else return item; }, issue_space{}); - - static_assert(size() == Layout::coord_rank); - - constexpr auto u_r = make_layout(issue_space{}); // we use this layout to describe the register layout + constexpr auto u_r = make_layout<-1>(issue_space); // we use this layout to describe the register layout #if OPUS_TILE_CONTAINER == 0 - auto a_ = x; + auto a_ = [&](){ if constexpr (is_array_v) return to_vector(x); + else if constexpr (is_dtype_v) return make_repeated_vector(x, number(reduce_tuple_mul(issue_space)).value>{}); + else if constexpr (is_vector_v) return x; }(); #elif OPUS_TILE_CONTAINER == 1 auto a_ = to_array(x); #endif @@ -896,8 +1014,97 @@ struct gmem { __amdgpu_buffer_rsrc_t cached_rsrc; }; +template OPUS_D decltype(auto) make_gmem(const T_* ptr, uint32_t size = 0xffffffff, uint32_t config = buffer_default_config()) { return gmem{ptr, size, config}; } +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// smem load/store related. 
TODO: tr_load template -OPUS_D decltype(auto) make_gmem(const T_* ptr, uint32_t size = 0xffffffff, uint32_t config = buffer_default_config()) { return gmem{ptr, size, config}; } +struct smem { + using T = remove_cvref_t; + using scalar_type = typename vector_traits::dtype; + static constexpr index_t vector_size = vector_traits::size(); + template using vector_type = vector_t; + + OPUS_D smem(void* ptr_) : ptr(reinterpret_cast(ptr_)) {} + + template OPUS_D auto _load(int v_os/* in unit of byte*/) { using type = vector_type; return *reinterpret_cast(ptr + v_os); } + + template + OPUS_D void _store(const V& x, int v_os/* in unit of byte*/) { + static_assert((vec * vector_size) == vector_traits::size(), "vector size need to be same, please check"); + using type = vector_type; + *reinterpret_cast(ptr + v_os) = __builtin_bit_cast(type, x); + } + + template OPUS_D auto load(int v_os) { return _load(v_os * sizeof(T)); } + + template || is_dtype_v || is_array_v), bool> = true> + OPUS_D void store(const V& x, int v_os) { + static_assert(std::is_same_v::dtype, scalar_type>, "scalar type must be same for the data to be stored" ); + if constexpr (is_dtype_v && (vec * vector_size) % vector_traits::size() == 0) { + _store(make_repeated_vector(x, number::size()>{}), v_os * sizeof(T)); + } else { + static_assert((vec * vector_size) == vector_traits::size(), "vector size need to be same, please check" ); + _store(x, v_os * sizeof(T)); + } + } + + // bulk load API, give me a Shape of this tile, will issue multiple load instruction based on the y-shape space + template, bool> = true> + OPUS_D auto load(const Layout& u) + { + constexpr auto issue_space = layout_to_issue_space(); + constexpr auto issue_space_vec = vectorize_issue_space(issue_space, number{}); + constexpr auto r_elem = get<0>(reduce_tuple_mul(issue_space_vec)); + +#if OPUS_TILE_CONTAINER == 0 + constexpr auto u_r = make_layout<-1>(issue_space); // we use this layout to describe the register layout + vector_t r; // local 
scratch to host the loaded register, and return it + static_ford(issue_space_vec, [&](auto ... ids){ + auto tmp = load(u(ids...)); + constexpr index_t u_rs = u_r(ids...); + set_slice(r, tmp, number{}, number{}); + }); + return r; +#elif OPUS_TILE_CONTAINER == 1 + constexpr auto u_r = make_layout<-1>(issue_space_vec); // we use this layout to describe the register layout + array, r_elem.value> r; // local scratch to host the loaded register, and return it + static_ford(issue_space_vec, [&](auto ... ids){ r[u_r(ids...)] = load(u(ids...)); }); // issue the loading instruction multiple times + return r; +#endif + } + + template || is_dtype_v || is_vector_v) && is_layout_v), bool> = true> + OPUS_D void store(const V& x, const Layout& u) + { + constexpr auto issue_space = layout_to_issue_space(); + constexpr auto issue_space_vec = vectorize_issue_space(issue_space, number{}); + + constexpr auto u_r = make_layout<-1>(issue_space); // we use this layout to describe the register layout +#if OPUS_TILE_CONTAINER == 0 + auto a_ = [&](){ if constexpr (is_array_v) return to_vector(x); + else if constexpr (is_dtype_v) return make_repeated_vector(x, number(reduce_tuple_mul(issue_space)).value>{}); + else if constexpr (is_vector_v) return x; }(); +#elif OPUS_TILE_CONTAINER == 1 + auto a_ = to_array(x); +#endif + static_ford(issue_space_vec, [&](auto ... 
ids){ // issue the store instruction multiple times + auto v_ = slice(a_, number{}, number{}); + store(v_, u(ids...)); + }); + } + char * ptr; // in unit of byte +}; + +template OPUS_D decltype(auto) make_smem(T_* ptr) { return smem{ptr}; } +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// waitcnt +// vmcnt=0~63([15:14],[3:0]), lgkmcnt=0~15([11:8]), expcnt=0~7([6:4]) +template +OPUS_D void s_waitcnt(number, number, number = {}) +{ __builtin_amdgcn_s_waitcnt((((0b110000 & vmcnt) << (14 - 4)) | (0b1111 & vmcnt)) | ((0b111 & expcnt) << 4) | ((0b1111 & lgkmcnt) << 8)); } + +template OPUS_D void s_waitcnt_vmcnt(number) { s_waitcnt(number{}, number<15>{}); } +template OPUS_D void s_waitcnt_lgkmcnt(number) { s_waitcnt(number<63>{}, number{}); } ///////////////////////////////////////////////////////////////////////////////////////////////////////// // mfma @@ -1028,7 +1235,6 @@ OPUS_D constexpr auto unfold_p_coord(const Dim&, const Coord& coord) { return unfold_p_coord_impl(flatten_dim, coord, number<0>{}, make_index_seq()>{}); } -// template OPUS_D constexpr auto unfold_x_stride(const Dim&, const Shape&, const Stride& stride) { constexpr auto flatten_dim = flatten_tuple(Dim{}); @@ -1051,29 +1257,29 @@ OPUS_D constexpr auto unfold_x_stride(const Dim&, const Shape&, const Stride& st OPUS_D static constexpr auto p_shape_b() { return p_shape(shape_b(), dim_b()); } \ OPUS_D static constexpr auto p_shape_c() { return p_shape(shape_c(), dim_c()); } \ \ - OPUS_D constexpr auto layout_a() { return make_layout(shape_a());} \ - OPUS_D constexpr auto layout_b() { return make_layout(shape_b());} \ - OPUS_D constexpr auto layout_c() { return make_layout(shape_c());} \ - \ - template OPUS_D constexpr auto layout_a(S&& stride) { return opus::make_layout(shape_a(), unfold_x_stride(dim_a(), shape_a(), stride));} \ - template OPUS_D constexpr auto layout_b(S&& stride) { return opus::make_layout(shape_b(), 
unfold_x_stride(dim_b(), shape_b(), stride));} \ - template OPUS_D constexpr auto layout_c(S&& stride) { return opus::make_layout(shape_c(), unfold_x_stride(dim_c(), shape_c(), stride));} \ - /* Note, all the coord passed in must be p_coord*/ \ - template OPUS_D constexpr auto layout_a(S&& stride, C&& z) { OPUS_KP_(dim_a); return opus::make_layout(shape_a(), unfold_x_stride(dim_a(), shape_a(), stride), opus::unfold_p_coord(dim_a(), z));} \ - template OPUS_D constexpr auto layout_b(S&& stride, C&& z) { OPUS_KP_(dim_b); return opus::make_layout(shape_b(), unfold_x_stride(dim_b(), shape_b(), stride), opus::unfold_p_coord(dim_b(), z));} \ - template OPUS_D constexpr auto layout_c(S&& stride, C&& z) { OPUS_KP_(dim_c); return opus::make_layout(shape_c(), unfold_x_stride(dim_c(), shape_c(), stride), opus::unfold_p_coord(dim_c(), z));} \ - \ - template OPUS_D constexpr auto layout_a_packed(C&& z) { OPUS_KP_(dim_a); return make_layout_packed(shape_a(), opus::unfold_p_coord(dim_a(), z));} \ - template OPUS_D constexpr auto layout_b_packed(C&& z) { OPUS_KP_(dim_b); return make_layout_packed(shape_b(), opus::unfold_p_coord(dim_b(), z));} \ - template OPUS_D constexpr auto layout_c_packed(C&& z) { OPUS_KP_(dim_c); return make_layout_packed(shape_c(), opus::unfold_p_coord(dim_c(), z));} \ - \ - template && ...), bool> = true> OPUS_D constexpr auto layout_a(Ts&&... strides) {return layout_a(opus::make_tuple(strides...)); } \ - template && ...), bool> = true> OPUS_D constexpr auto layout_b(Ts&&... strides) {return layout_b(opus::make_tuple(strides...)); } \ - template && ...), bool> = true> OPUS_D constexpr auto layout_c(Ts&&... 
strides) {return layout_c(opus::make_tuple(strides...)); } \ - \ - OPUS_D constexpr auto y_layout_a() { return make_layout(y_shape_a());} \ - OPUS_D constexpr auto y_layout_b() { return make_layout(y_shape_b());} \ - OPUS_D constexpr auto y_layout_c() { return make_layout(y_shape_c());} + template OPUS_D constexpr auto layout_a() { return make_layout(shape_a());} \ + template OPUS_D constexpr auto layout_b() { return make_layout(shape_b());} \ + template OPUS_D constexpr auto layout_c() { return make_layout(shape_c());} \ + \ + template OPUS_D constexpr auto layout_a(S&& stride) { return make_layout(shape_a(), unfold_x_stride(dim_a(), shape_a(), stride));} \ + template OPUS_D constexpr auto layout_b(S&& stride) { return make_layout(shape_b(), unfold_x_stride(dim_b(), shape_b(), stride));} \ + template OPUS_D constexpr auto layout_c(S&& stride) { return make_layout(shape_c(), unfold_x_stride(dim_c(), shape_c(), stride));} \ + /* Note, all the coord passed in must be p_coord*/ \ + template OPUS_D constexpr auto layout_a(S&& stride, C&& z) { OPUS_KP_(dim_a); return make_layout(shape_a(), unfold_x_stride(dim_a(), shape_a(), stride), opus::unfold_p_coord(dim_a(), z));} \ + template OPUS_D constexpr auto layout_b(S&& stride, C&& z) { OPUS_KP_(dim_b); return make_layout(shape_b(), unfold_x_stride(dim_b(), shape_b(), stride), opus::unfold_p_coord(dim_b(), z));} \ + template OPUS_D constexpr auto layout_c(S&& stride, C&& z) { OPUS_KP_(dim_c); return make_layout(shape_c(), unfold_x_stride(dim_c(), shape_c(), stride), opus::unfold_p_coord(dim_c(), z));} \ + \ + template OPUS_D constexpr auto layout_a_packed(C&& z) { OPUS_KP_(dim_a); return make_layout_packed(shape_a(), opus::unfold_p_coord(dim_a(), z));} \ + template OPUS_D constexpr auto layout_b_packed(C&& z) { OPUS_KP_(dim_b); return make_layout_packed(shape_b(), opus::unfold_p_coord(dim_b(), z));} \ + template OPUS_D constexpr auto layout_c_packed(C&& z) { OPUS_KP_(dim_c); return make_layout_packed(shape_c(), 
opus::unfold_p_coord(dim_c(), z));} \ + \ + template && ...), bool> = true> OPUS_D constexpr auto layout_a(Ts&&... strides) {return layout_a(opus::make_tuple(strides...)); } \ + template && ...), bool> = true> OPUS_D constexpr auto layout_b(Ts&&... strides) {return layout_b(opus::make_tuple(strides...)); } \ + template && ...), bool> = true> OPUS_D constexpr auto layout_c(Ts&&... strides) {return layout_c(opus::make_tuple(strides...)); } \ + \ + template OPUS_D constexpr auto y_layout_a() { return make_layout(y_shape_a());} \ + template OPUS_D constexpr auto y_layout_b() { return make_layout(y_shape_b());} \ + template OPUS_D constexpr auto y_layout_c() { return make_layout(y_shape_c());} // Note: any class to support adaptor need include OPUS_ADAPTOR_LAYOUT_API_DEFINE and implement shape_a()/shape_b()/shape_c() // P indicates dim cross thread, Y indicates dim within thread, this is X layout (X=P+Y) view the tensor as a whole @@ -1262,25 +1468,25 @@ OPUS_D decltype(auto) make_tiled_mma(ES, TS, WS, WA&& = {}, TA&& = {}) { } ///////////////////////////////////////////////////////////////////////////////////////////////////////// -// partition -template OPUS_D constexpr auto partition_layout_a(M&& mma) { return mma.layout_a(); } -template OPUS_D constexpr auto partition_layout_b(M&& mma) { return mma.layout_b(); } -template OPUS_D constexpr auto partition_layout_c(M&& mma) { return mma.layout_c(); } - -template, bool> = true> OPUS_D constexpr auto partition_layout_a(M&& mma, S&& x_stride) { return mma.layout_a(std::forward(x_stride)); } -template, bool> = true> OPUS_D constexpr auto partition_layout_b(M&& mma, S&& x_stride) { return mma.layout_b(std::forward(x_stride)); } -template, bool> = true> OPUS_D constexpr auto partition_layout_c(M&& mma, S&& x_stride) { return mma.layout_c(std::forward(x_stride)); } - -template && is_tuple_v, bool> = true> -OPUS_D constexpr auto partition_layout_a(M&& mma, S&& x_stride, C&& p_coord) { return 
mma.layout_a(std::forward(x_stride), std::forward(p_coord)); } -template && is_tuple_v, bool> = true> -OPUS_D constexpr auto partition_layout_b(M&& mma, S&& x_stride, C&& p_coord) { return mma.layout_b(std::forward(x_stride), std::forward(p_coord)); } -template && is_tuple_v, bool> = true> -OPUS_D constexpr auto partition_layout_c(M&& mma, S&& x_stride, C&& p_coord) { return mma.layout_c(std::forward(x_stride), std::forward(p_coord)); } - -template, bool> = true> OPUS_D constexpr auto partition_layout_a_packed(M&& mma, C&& p_coord) { return mma.layout_a_packed(std::forward(p_coord)); } -template, bool> = true> OPUS_D constexpr auto partition_layout_b_packed(M&& mma, C&& p_coord) { return mma.layout_b_packed(std::forward(p_coord)); } -template, bool> = true> OPUS_D constexpr auto partition_layout_c_packed(M&& mma, C&& p_coord) { return mma.layout_c_packed(std::forward(p_coord)); } +// partition, use cached_vec to dispatch which layout implementation. cached_vec < 0 : "layout", cached_vec == 0 : "layout_linear", cached_vec > 0 : "layout_cached" +template OPUS_D constexpr auto partition_layout_a(M&& mma) { return mma.template layout_a(); } +template OPUS_D constexpr auto partition_layout_b(M&& mma) { return mma.template layout_b(); } +template OPUS_D constexpr auto partition_layout_c(M&& mma) { return mma.template layout_c(); } + +template, bool> = true> OPUS_D constexpr auto partition_layout_a(M&& mma, S&& x_stride) { return mma.template layout_a(std::forward(x_stride)); } +template, bool> = true> OPUS_D constexpr auto partition_layout_b(M&& mma, S&& x_stride) { return mma.template layout_b(std::forward(x_stride)); } +template, bool> = true> OPUS_D constexpr auto partition_layout_c(M&& mma, S&& x_stride) { return mma.template layout_c(std::forward(x_stride)); } + +template && is_tuple_v, bool> = true> +OPUS_D constexpr auto partition_layout_a(M&& mma, S&& x_stride, C&& p_coord) { return mma.template layout_a(std::forward(x_stride), std::forward(p_coord)); } +template 
&& is_tuple_v, bool> = true> +OPUS_D constexpr auto partition_layout_b(M&& mma, S&& x_stride, C&& p_coord) { return mma.template layout_b(std::forward(x_stride), std::forward(p_coord)); } +template && is_tuple_v, bool> = true> +OPUS_D constexpr auto partition_layout_c(M&& mma, S&& x_stride, C&& p_coord) { return mma.template layout_c(std::forward(x_stride), std::forward(p_coord)); } + +template, bool> = true> OPUS_D constexpr auto partition_layout_a_packed(M&& mma, C&& p_coord) { return mma.template layout_a_packed(std::forward(p_coord)); } +template, bool> = true> OPUS_D constexpr auto partition_layout_b_packed(M&& mma, C&& p_coord) { return mma.template layout_b_packed(std::forward(p_coord)); } +template, bool> = true> OPUS_D constexpr auto partition_layout_c_packed(M&& mma, C&& p_coord) { return mma.template layout_c_packed(std::forward(p_coord)); } #undef OPUS_KP_ // clang-format on } // namespace