diff --git a/backends/vulkan/runtime/api/gen_vulkan_spv.py b/backends/vulkan/runtime/api/gen_vulkan_spv.py index 5f1e579a14f..89f6353944b 100644 --- a/backends/vulkan/runtime/api/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/api/gen_vulkan_spv.py @@ -34,22 +34,13 @@ CPP_H_NAME = "spv.h" CPP_SRC_NAME = "spv.cpp" +# Basic configuration settings for shaders DEFAULT_ENV: Dict[str, Any] = { "PRECISION": "highp", - "FLOAT_IMAGE_FORMAT": "rgba16f", - "INT_IMAGE_FORMAT": "rgba32i", - "UINT_IMAGE_FORMAT": "rgba32ui", } -TYPES_ENV: Dict[str, Any] = { - "IMAGE_FORMAT": { - "float": "rgba32f", - "half": "rgba16f", - "int": "rgba32i", - "uint": "rgba32ui", - "int8": "rgba8i", - "uint8": "rgba8ui", - }, +# Establishes relationships between different tensor types and different GLSL types +TYPE_MAPPINGS: Dict[str, Any] = { "IMAGE_T": { 3: { "float": "image3D", @@ -78,29 +69,74 @@ "uint": "usampler2D", }, }, - "VEC4_T": { - "float": "vec4", - "half": "vec4", - "int": "ivec4", - "uint": "uvec4", - "int8": "vec4", - "uint8": "uvec4", - }, - "T": { - "float": "float", - "half": "float", - "int": "int", - "uint": "uint", - "int8": "int", - "uint8": "uint8", + "IMAGE_FORMAT": { + "float": "rgba32f", + "half": "rgba16f", + "int": "rgba32i", + "uint": "rgba32ui", + "int8": "rgba8i", + "uint8": "rgba8ui", }, } -FUNCS_ENV: Dict[str, Any] = { - "GET_POS": { + +def define_variable(name: str) -> str: + if name in locals(): + return f"#define {name} {locals()[name]}" + elif name in globals(): + return f"#define {name} {globals()[name]}" + else: + raise RuntimeError(f"{name} is not defined") + + +def get_buffer_scalar_type(dtype: str) -> str: + # TODO(ssjia): use float16_t for half types + if dtype == "half": + return "float" + # TODO(ssjia): use int8_t for int8 types + elif dtype[-1] == "8": + return dtype[:-1] + + return dtype + + +def get_texel_type(dtype: str) -> str: + image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] + if image_format[-1] == "f": + return "vec4" + elif image_format[-2:] 
== "ui": + return "uvec4" + elif image_format[-1] == "i": + return "ivec4" + raise AssertionError(f"Invalid image format: {image_format}") + + +def get_gvec_type(dtype: str, n: int) -> str: + gvec4_type = get_texel_type(dtype) + return gvec4_type[:-1] + str(n) + + +def get_texel_component_type(dtype: str) -> str: + vec4_type = get_texel_type(dtype) + if vec4_type[:3] == "vec": + return "float" + elif vec4_type[:4] == "ivec": + return "int" + elif vec4_type[:4] == "uvec": + return "uint" + raise AssertionError(f"Invalid vec4 type: {vec4_type}") + + +UTILITY_FNS: Dict[str, Any] = { + "macro_define": define_variable, + "get_pos": { 3: lambda pos: pos, 2: lambda pos: f"{pos}.xy", - } + }, + "buffer_scalar_type": get_buffer_scalar_type, + "texel_type": get_texel_type, + "gvec_type": get_gvec_type, + "texel_component_type": get_texel_component_type, } @@ -376,26 +412,6 @@ def create_shader_params( for key, value in variant_params.items(): shader_params[key] = value - shader_dtype = shader_params.get("DTYPE", "float") - - if shader_dtype == "int": - shader_params["FORMAT"] = self.env["INT_IMAGE_FORMAT"] - elif shader_dtype == "uint": - shader_params["FORMAT"] = self.env["UINT_IMAGE_FORMAT"] - elif shader_dtype == "int32": - shader_params["FORMAT"] = "rgba32i" - elif shader_dtype == "uint32": - shader_params["FORMAT"] = "rgba32ui" - elif shader_dtype == "int8": - shader_params["FORMAT"] = "rgba8i" - elif shader_dtype == "uint8": - shader_params["FORMAT"] = "rgba8ui" - elif shader_dtype == "float32": - shader_params["FORMAT"] = "rgba32f" - # Assume float by default - else: - shader_params["FORMAT"] = self.env["FLOAT_IMAGE_FORMAT"] - return shader_params def constructOutputMap(self) -> None: @@ -732,9 +748,9 @@ def main(argv: List[str]) -> int: ) options = parser.parse_args() - DEFAULT_ENV.update(TYPES_ENV) - DEFAULT_ENV.update(FUNCS_ENV) env = DEFAULT_ENV + env.update(TYPE_MAPPINGS) + env.update(UTILITY_FNS) for key, value in parse_arg_env(options.env).items(): env[key] = 
value diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index c648db2c4c2..5a64cf78031 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -8,12 +8,17 @@ #version 450 core -#include "broadcasting_utils.h" -#include "indexing_utils.h" - #define PRECISION ${PRECISION} -#define OP(X, Y, A) ${OPERATOR} +#define VEC4_T ${texel_type(DTYPE)} + +#define to_tensor_idx to_tensor_idx_${PACKING} +#define to_texture_pos to_texture_pos_${PACKING} + +#define op(X, Y, A) ${OPERATOR} + +#include "broadcasting_utils.h" +#include "indexing_utils.h" layout(std430) buffer; @@ -50,22 +55,22 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_${PACKING}(pos, out_sizes.data); + const ivec4 idx = to_tensor_idx(pos, out_sizes.data); - if (any(greaterThanEqual(coord, out_sizes.data))) { + if (any(greaterThanEqual(idx, out_sizes.data))) { return; } - ivec4 in_coord = out_coord_to_in_coord(coord, in_sizes.data); - ${VEC4_T[DTYPE]} in_texel = ${VEC4_T[DTYPE]}(texelFetch( + ivec4 in_idx = broadcast_indices(idx, in_sizes.data); + VEC4_T in_texel = VEC4_T(texelFetch( image_in, - COORD_TO_POS_${PACKING}(in_coord, in_sizes.data), + to_texture_pos(in_idx, in_sizes.data), 0)); - ivec4 other_coord = out_coord_to_in_coord(coord, other_sizes.data); - ${VEC4_T[DTYPE]} other_texel = ${VEC4_T[DTYPE]}(texelFetch( + ivec4 other_idx = broadcast_indices(idx, other_sizes.data); + VEC4_T other_texel = VEC4_T(texelFetch( image_other, - COORD_TO_POS_${PACKING}(other_coord, other_sizes.data), + to_texture_pos(other_idx, other_sizes.data), 0)); // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment. 
@@ -76,5 +81,5 @@ void main() { other_texel = other_texel.xxxx; } - imageStore(image_out, pos, ${VEC4_T[DTYPE]}(OP(in_texel, other_texel, alpha.data))); + imageStore(image_out, pos, VEC4_T(op(in_texel, other_texel, alpha.data))); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml index 28f65ee29c7..a8ef9c1d960 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml @@ -9,22 +9,16 @@ binary_op: OPERATOR: X + A * Y NDIM: 3 DTYPE: float - PACKING: CHANNELS_PACKED + PACKING: C_packed generate_variant_forall: PACKING: - - VALUE: CHANNELS_PACKED - SUFFIX: C_packed - - VALUE: WIDTH_PACKED - SUFFIX: W_packed - - VALUE: HEIGHT_PACKED - SUFFIX: H_packed + - VALUE: C_packed + - VALUE: W_packed + - VALUE: H_packed DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float - VALUE: int - SUFFIX: int shader_variants: - NAME: binary_add - NAME: binary_sub diff --git a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h b/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h index 55fd8b8e482..840e98a25ed 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h @@ -6,12 +6,12 @@ * LICENSE file in the root directory of this source tree. 
*/ -ivec4 out_coord_to_in_coord(const ivec4 out_coord, const ivec4 in_sizes) { - ivec4 in_coord = out_coord; +ivec4 broadcast_indices(const ivec4 out_idx, const ivec4 in_sizes) { + ivec4 in_idx = out_idx; for (int i = 0; i < 4; ++i) { - if (out_coord[i] >= in_sizes[i]) { - in_coord[i] = 0; + if (out_idx[i] >= in_sizes[i]) { + in_idx[i] = 0; } } - return in_coord; + return in_idx; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl index 30051e5f5a3..c3ede99cf4e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl @@ -10,6 +10,8 @@ #define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + #include "indexing_utils.h" layout(std430) buffer; @@ -78,12 +80,12 @@ void main() { kstart.y += pos.z * params.kernel_size.y; // Perform the convolution by iterating over the overlay region. - ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); + VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); const int ic4 = extra_params.in_group_size / 4; for (int z4 = 0; z4 < ic4; ++z4, kstart.x += params.kernel_size.x * 4) { for (int y = start.y, ky = kstart.y; y < end.y; y += params.dilation.y, ++ky) { for (int x = start.x, kx = kstart.x; x < end.x; x += params.dilation.x, kx += 4) { - const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, z4), 0); + const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, z4), 0); const ivec4 kxs = kx + ivec4(0, 1, 2, 3); // To explain the calculation below, the contents of in_texel and the diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml index 6764a2daa75..882737b6f19 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml @@ -11,8 +11,6 @@ conv2d: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float 
shader_variants: - NAME: conv2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl index 50b60ad956d..de81c7cbdde 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl @@ -10,6 +10,8 @@ #define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + #include "indexing_utils.h" layout(std430) buffer; @@ -66,14 +68,14 @@ void main() { const ivec2 start = ipos; const ivec2 end = ipos + extra_params.overlay_region.xy; - ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); + VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); int kx = 0; for (int y = start.y; y < end.y; y += params.dilation.y) { for (int x = start.x; x < end.x; x += params.dilation.x) { // The weight kernel was rearranged such that every NxN filter is // flattened to fit in one row. Each filter was then stacked on top of // each other vertically. - const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0); + const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0); sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum); ++kx; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml index 560887f3dc1..31c9778b2aa 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml @@ -11,8 +11,6 @@ conv2d_dw: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: conv2d_dw diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index 470eef6cdeb..a514137db39 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -10,6 +10,8 @@ 
#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + #include "indexing_utils.h" layout(std430) buffer; @@ -66,7 +68,7 @@ void main() { const ivec2 start = ipos; const ivec2 end = ipos + extra_params.overlay_region.xy; - ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); + VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); int kx = 0; for (int y = start.y, i = 0; i < ${TILE_SIZE}; y += params.dilation.y, i++) { for (int x = start.x, j = 0; j < ${TILE_SIZE}; x += params.dilation.x, j++) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml index 1d4405e0276..b9346abdd9d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml @@ -12,9 +12,7 @@ conv2d_dw_output_tile: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: conv2d_dw_output_tile_3x3 - NAME: conv2d_dw_output_tile_5x5 diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl index ef2b54ba354..267bf5c1fe9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl @@ -10,13 +10,20 @@ #define PRECISION ${PRECISION} +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} +#define SCALAR_T ${texel_component_type(DTYPE)} + +#define to_tensor_idx to_tensor_idx_${PACKING} +#define get_packed_stride get_packed_stride_${PACKING} + #include "indexing_utils.h" layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - ${T[DTYPE]} data[]; + BUF_T data[]; } 
buffer_in; @@ -67,17 +74,17 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; */ void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_CHANNELS_PACKED(pos, gpu_sizes.data); + const ivec4 idx = to_tensor_idx(pos, gpu_sizes.data); - if (any(greaterThanEqual(coord, gpu_sizes.data))) { + if (any(greaterThanEqual(idx, gpu_sizes.data))) { return; } // As in usual staging shaders, map from GPU texel position to normal CPU // buffer indices: (9,3) -> (4,3,9) - const int base_index = COORD_TO_BUFFER_IDX(coord, gpu_sizes.data); + const int base_index = to_buffer_i(idx, gpu_sizes.data); const ivec4 p0 = - base_index + ivec4(0, 1, 2, 3) * STRIDE_CHANNELS_PACKED(gpu_sizes.data); + base_index + ivec4(0, 1, 2, 3) * get_packed_stride(gpu_sizes.data); // Re-map the normal CPU buffer indices to special indices, through a series // of mappings: reshape is a no-op to the underlying indices, so we only map @@ -89,19 +96,19 @@ void main() { const int W = original_sizes.data.x; // Undo step 3 permute: (4,3,1,9) -> (3,4,1,9) - const ivec4 p1 = SWAP_ADJ_DIMS(p0, 4, (Np / 4), (C * H * W)); + const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (C * H * W)); // Undo step 1 pad: (12,1,3,3) -> (11,1,3,3) // For values in the padded region, write zero instead of buffer data. 
const ivec4 n = p1 / (C * H * W); const ivec4 mask = ivec4(greaterThanEqual(n, ivec4(N))); - ${T[DTYPE]} val_x = mix(buffer_in.data[p1.x], 0, mask.x); - ${T[DTYPE]} val_y = mix(buffer_in.data[p1.y], 0, mask.y); - ${T[DTYPE]} val_z = mix(buffer_in.data[p1.z], 0, mask.z); - ${T[DTYPE]} val_w = mix(buffer_in.data[p1.w], 0, mask.w); + SCALAR_T val_x = mix(SCALAR_T(buffer_in.data[p1.x]), 0, mask.x); + SCALAR_T val_y = mix(SCALAR_T(buffer_in.data[p1.y]), 0, mask.y); + SCALAR_T val_z = mix(SCALAR_T(buffer_in.data[p1.z]), 0, mask.z); + SCALAR_T val_w = mix(SCALAR_T(buffer_in.data[p1.w]), 0, mask.w); - ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w); + VEC4_T texel = VEC4_T(val_x, val_y, val_z, val_w); imageStore(image_out, pos.xy, texel); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml index e7fc5f797c8..e8b29a71b9b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml @@ -7,11 +7,10 @@ conv2d_dw_prepack_weights: parameter_names_with_default_values: DTYPE: float + PACKING: C_packed generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: conv2d_dw_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl index 26b4fa0d76f..fc1405f6439 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl @@ -10,13 +10,20 @@ #define PRECISION ${PRECISION} +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} +#define SCALAR_T ${texel_component_type(DTYPE)} + +#define to_tensor_idx to_tensor_idx_${PACKING} +#define get_packed_stride get_packed_stride_${PACKING} + #include 
"indexing_utils.h" layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - ${T[DTYPE]} data[]; + BUF_T data[]; } buffer_in; @@ -81,17 +88,17 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; */ void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_CHANNELS_PACKED(pos, gpu_sizes.data); + const ivec4 idx = to_tensor_idx(pos, gpu_sizes.data); - if (any(greaterThanEqual(coord, gpu_sizes.data))) { + if (any(greaterThanEqual(idx, gpu_sizes.data))) { return; } // As in usual staging shaders, map from GPU texel position to normal CPU // buffer indices: (24,9) -> (4,9,24) - const int base_index = COORD_TO_BUFFER_IDX(coord, gpu_sizes.data); + const int base_index = to_buffer_i(idx, gpu_sizes.data); const ivec4 p0 = - base_index + ivec4(0, 1, 2, 3) * STRIDE_CHANNELS_PACKED(gpu_sizes.data); + base_index + ivec4(0, 1, 2, 3) * get_packed_stride(gpu_sizes.data); // Re-map the normal CPU buffer indices to special indices, through a series // of mappings: reshape is a no-op to the underlying indices, so we only map @@ -107,10 +114,10 @@ void main() { // Undo step 4 permute: (12,3,2,12) -> (12,2,3,12) // Undo step 3 permute, part 1: (12,2,3h,3w,4) -> (12,2,3h,4,3w) // Undo step 3 permute, part 2: (12,2,3h,4,3w) -> (12,2,4,3h,3w) - const ivec4 p1 = SWAP_ADJ_DIMS(p0, 4, (Np / 4), (H * Cp * W)); - const ivec4 p2 = SWAP_ADJ_DIMS(p1, H, (Cp / 4), (W * 4)); - const ivec4 p3 = SWAP_ADJ_DIMS(p2, W, 4, 1); - const ivec4 p4 = SWAP_ADJ_DIMS(p3, H, 4, W); + const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (H * Cp * W)); + const ivec4 p2 = swap_adj_dims(p1, H, (Cp / 4), (W * 4)); + const ivec4 p3 = swap_adj_dims(p2, W, 4, 1); + const ivec4 p4 = swap_adj_dims(p3, H, 4, W); // Undo step 1 pad: (12,8,3,3) -> (10,7,3,3) // For values in the padded region, write zero 
instead of buffer data. @@ -120,12 +127,12 @@ void main() { const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) | ivec4(greaterThanEqual(n, ivec4(N))); - ${T[DTYPE]} val_x = mix(buffer_in.data[p5.x], 0, mask.x); - ${T[DTYPE]} val_y = mix(buffer_in.data[p5.y], 0, mask.y); - ${T[DTYPE]} val_z = mix(buffer_in.data[p5.z], 0, mask.z); - ${T[DTYPE]} val_w = mix(buffer_in.data[p5.w], 0, mask.w); + SCALAR_T val_x = mix(SCALAR_T(buffer_in.data[p5.x]), 0, mask.x); + SCALAR_T val_y = mix(SCALAR_T(buffer_in.data[p5.y]), 0, mask.y); + SCALAR_T val_z = mix(SCALAR_T(buffer_in.data[p5.z]), 0, mask.z); + SCALAR_T val_w = mix(SCALAR_T(buffer_in.data[p5.w]), 0, mask.w); - ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w); + VEC4_T texel = VEC4_T(val_x, val_y, val_z, val_w); imageStore(image_out, pos.xy, texel); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml index 277df2619ff..355c518555d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml @@ -7,11 +7,10 @@ conv2d_prepack_weights: parameter_names_with_default_values: DTYPE: float + PACKING: C_packed generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: conv2d_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 2ae4f972754..6a4b8fcb288 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -10,6 +10,8 @@ #define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + #include "indexing_utils.h" layout(std430) buffer; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml index a6b03452e23..2e04b6a3991 100644 --- 
a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml @@ -12,8 +12,6 @@ conv2d_pw: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: conv2d_pw diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl index cb4d21e172a..60c7043fd9d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl @@ -10,6 +10,8 @@ #define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; @@ -73,11 +75,11 @@ void main() { int kx_start = (extra_params.overlay_region.x - 1 - (ipos.x - params.stride.x * start.x)) * ic; - ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); + VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); for (int y = start.y, ky = ky_start; y < end.y; ++y, ky += params.stride.y) { for (int x = start.x, kx = kx_start; x < end.x; ++x, kx += kx_stride) { for (int z4 = 0; z4 < ic / 4; ++z4, kx += 4) { - const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, z4), 0); + const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, z4), 0); const ivec4 kxs = kx + ivec4(0, 1, 2, 3); sum = fma(in_texel.xxxx, texelFetch(kernel_in, ivec2(kxs.x, ky), 0), sum); diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml index ab2c82a901e..7fc40c3242e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml @@ -11,8 +11,6 @@ conv_transpose2d: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: 
conv_transpose2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl index 22bdbb506f7..6d81197531f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl @@ -10,13 +10,20 @@ #define PRECISION ${PRECISION} +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} +#define SCALAR_T ${texel_component_type(DTYPE)} + +#define to_tensor_idx to_tensor_idx_${PACKING} +#define get_packed_stride get_packed_stride_${PACKING} + #include "indexing_utils.h" layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - ${T[DTYPE]} data[]; + BUF_T data[]; } buffer_in; @@ -53,17 +60,17 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; */ void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_CHANNELS_PACKED(pos, gpu_sizes.data); + const ivec4 idx = to_tensor_idx(pos, gpu_sizes.data); - if (any(greaterThanEqual(coord, gpu_sizes.data))) { + if (any(greaterThanEqual(idx, gpu_sizes.data))) { return; } // As in usual staging shaders, map from GPU texel position to normal CPU // buffer indices: (36,6) -> (4,6,36) - const int base_index = COORD_TO_BUFFER_IDX(coord, gpu_sizes.data); + const int base_index = to_buffer_i(idx, gpu_sizes.data); const ivec4 p0 = - base_index + ivec4(0, 1, 2, 3) * STRIDE_CHANNELS_PACKED(gpu_sizes.data); + base_index + ivec4(0, 1, 2, 3) * get_packed_stride(gpu_sizes.data); // Re-map the normal CPU buffer indices to special indices, through a series // of mappings: reshape is a no-op to the underlying indices, so we only map @@ -81,14 +88,14 @@ void main() { // Undo step 3 permute, part 
2: (8,a,c,b,4) -> (8,c,a,b,4) // Undo step 3 permute, part 3: (8,c,a,b,4) -> (8,c,a,4,b) // Undo step 3 permute, part 4: (8,c,a,4,b) -> (8,c,4,a,b) - const ivec4 p1 = SWAP_ADJ_DIMS(p0, 4, (Cp / 4), (H * Np * W)); - const ivec4 p2 = SWAP_ADJ_DIMS(p1, W, (Np / 4), 4); - const ivec4 p3 = SWAP_ADJ_DIMS(p2, H, (Np / 4), (W * 4)); - const ivec4 p4 = SWAP_ADJ_DIMS(p3, W, 4, 1); - const ivec4 p5 = SWAP_ADJ_DIMS(p4, H, 4, W); + const ivec4 p1 = swap_adj_dims(p0, 4, (Cp / 4), (H * Np * W)); + const ivec4 p2 = swap_adj_dims(p1, W, (Np / 4), 4); + const ivec4 p3 = swap_adj_dims(p2, H, (Np / 4), (W * 4)); + const ivec4 p4 = swap_adj_dims(p3, W, 4, 1); + const ivec4 p5 = swap_adj_dims(p4, H, 4, W); // Undo step 0 permute: (8,12,3,3) -> (12,8,3,3) - const ivec4 p6 = SWAP_ADJ_DIMS(p5, Cp, Np, (W * H)); + const ivec4 p6 = swap_adj_dims(p5, Cp, Np, (W * H)); // Undo step 0 flip: (2,3) const ivec4 w = p6 % W; const ivec4 h = p6 % (H * W) / W; @@ -102,12 +109,12 @@ void main() { const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) | ivec4(greaterThanEqual(n, ivec4(N))); - ${T[DTYPE]} val_x = mix(buffer_in.data[p8.x], 0, mask.x); - ${T[DTYPE]} val_y = mix(buffer_in.data[p8.y], 0, mask.y); - ${T[DTYPE]} val_z = mix(buffer_in.data[p8.z], 0, mask.z); - ${T[DTYPE]} val_w = mix(buffer_in.data[p8.w], 0, mask.w); + SCALAR_T val_x = mix(SCALAR_T(buffer_in.data[p8.x]), 0, mask.x); + SCALAR_T val_y = mix(SCALAR_T(buffer_in.data[p8.y]), 0, mask.y); + SCALAR_T val_z = mix(SCALAR_T(buffer_in.data[p8.z]), 0, mask.z); + SCALAR_T val_w = mix(SCALAR_T(buffer_in.data[p8.w]), 0, mask.w); - ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w); + VEC4_T texel = VEC4_T(val_x, val_y, val_z, val_w); imageStore(image_out, pos.xy, texel); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml index a6cae5c6a15..0e006ff5069 100644 --- 
a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml @@ -8,11 +8,10 @@ conv_transpose2d_prepack_weights: parameter_names_with_default_values: NDIM: 3 DTYPE: float + PACKING: C_packed generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: conv_transpose2d_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index c353908c416..c357d0487e5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -10,13 +10,20 @@ #define PRECISION ${PRECISION} +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} + +#define to_tensor_idx to_tensor_idx_${PACKING} +#define get_packed_dim get_packed_dim_${PACKING} +#define get_packed_stride get_packed_stride_${PACKING} + #include "indexing_utils.h" layout(std430) buffer; layout(set = 0, binding = 0) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in; layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer { - ${T[DTYPE]} data[]; + BUF_T data[]; } buffer_out; @@ -34,31 +41,31 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + const ivec4 idx = to_tensor_idx(pos, gpu_sizes.data); - if (any(greaterThanEqual(coord, gpu_sizes.data))) { + if (any(greaterThanEqual(idx, gpu_sizes.data))) { return; } - const ${VEC4_T[DTYPE]} intex = texelFetch(image_in, ${GET_POS[NDIM]("pos")}, 0); + const VEC4_T intex = texelFetch(image_in, ${get_pos[NDIM]("pos")}, 0); - const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); + const int base_index = to_buffer_i(idx, cpu_sizes.data); const ivec4 buf_indices = - 
base_index + ivec4(0, 1, 2, 3) * STRIDE_${PACKING}(cpu_sizes.data); + base_index + ivec4(0, 1, 2, 3) * get_packed_stride(cpu_sizes.data); - const int packed_dim_size = PACKED_DIM_${PACKING}(cpu_sizes.data); - int packed_coord = PACKED_DIM_${PACKING}(coord); + const int packed_dim_size = get_packed_dim(cpu_sizes.data); + int packed_idx = get_packed_dim(idx); - if (packed_coord < packed_dim_size) { - buffer_out.data[buf_indices.x] = intex.x; + if (packed_idx < packed_dim_size) { + buffer_out.data[buf_indices.x] = BUF_T(intex.x); } - if (packed_coord + 1 < packed_dim_size) { - buffer_out.data[buf_indices.y] = intex.y; + if (packed_idx + 1 < packed_dim_size) { + buffer_out.data[buf_indices.y] = BUF_T(intex.y); } - if (packed_coord + 2 < packed_dim_size) { - buffer_out.data[buf_indices.z] = intex.z; + if (packed_idx + 2 < packed_dim_size) { + buffer_out.data[buf_indices.z] = BUF_T(intex.z); } - if (packed_coord + 3 < packed_dim_size) { - buffer_out.data[buf_indices.w] = intex.w; + if (packed_idx + 3 < packed_dim_size) { + buffer_out.data[buf_indices.w] = BUF_T(intex.w); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml index 4683f51ac60..b1cc531b250 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -11,19 +11,13 @@ image_to_nchw: PACKING: CHANNELS_PACKED generate_variant_forall: PACKING: - - VALUE: CHANNELS_PACKED - SUFFIX: C_packed - - VALUE: WIDTH_PACKED - SUFFIX: W_packed - - VALUE: HEIGHT_PACKED - SUFFIX: H_packed + - VALUE: C_packed + - VALUE: W_packed + - VALUE: H_packed DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float - VALUE: int - SUFFIX: int shader_variants: - NAME: image3d_to_nchw - NAME: image2d_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 25a80602034..b3195ee7511 100644 --- 
a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -6,51 +6,44 @@ * LICENSE file in the root directory of this source tree. */ -#define DIVUP4(x) ((x + 3) / 4) +#define divup4(x) ((x + 3) / 4) -#define PACKED_DIM_CHANNELS_PACKED(vec) vec.z +#define to_buffer_i(idx, sizes) \ + idx.x + idx.y* sizes.x + idx.z* sizes.y* sizes.x + \ + idx.w* sizes.z* sizes.y* sizes.x; -#define PACKED_DIM_WIDTH_PACKED(vec) vec.x +#define get_packed_dim_C_packed(vec) vec.z +#define get_packed_dim_W_packed(vec) vec.x +#define get_packed_dim_H_packed(vec) vec.y -#define PACKED_DIM_HEIGHT_PACKED(vec) vec.y +#define get_packed_stride_C_packed(vec) (vec.x * vec.y) +#define get_packed_stride_W_packed(vec) (1) +#define get_packed_stride_H_packed(vec) (vec.x) -#define POS_TO_COORD_CHANNELS_PACKED(pos, sizes) \ +#define to_tensor_idx_C_packed(pos, sizes) \ ivec4(pos.x, pos.y, (pos.z * 4) % sizes.z, (pos.z * 4) / sizes.z) -#define POS_TO_COORD_WIDTH_PACKED(pos, sizes) \ +#define to_tensor_idx_W_packed(pos, sizes) \ ivec4((pos.x * 4), pos.y, pos.z % sizes.z, pos.z / sizes.z) -#define POS_TO_COORD_HEIGHT_PACKED(pos, sizes) \ +#define to_tensor_idx_H_packed(pos, sizes) \ ivec4(pos.x, (pos.y * 4), pos.z % sizes.z, pos.z / sizes.z) -#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \ - ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4) +#define to_texture_pos_C_packed(idx, sizes) \ + ivec3(idx.x, idx.y, (idx.z + idx.w * sizes.z) / 4) -#define COORD_TO_POS_WIDTH_PACKED(coord, sizes) \ - ivec3(coord.x / 4, coord.y, (coord.z + coord.w * sizes.z)) +#define to_texture_pos_W_packed(idx, sizes) \ + ivec3(idx.x / 4, idx.y, (idx.z + idx.w * sizes.z)) -#define COORD_TO_POS_HEIGHT_PACKED(coord, sizes) \ - ivec3(coord.x, coord.y / 4, (coord.z + coord.w * sizes.z)) - -#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \ - ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4) - -#define COORD_TO_BUFFER_IDX(coord, sizes) \ - coord.x 
+ coord.y* sizes.x + coord.z* sizes.y* sizes.x + \ - coord.w* sizes.z* sizes.y* sizes.x; - -#define STRIDE_CHANNELS_PACKED(vec) (vec.x * vec.y) - -#define STRIDE_WIDTH_PACKED(vec) (1) - -#define STRIDE_HEIGHT_PACKED(vec) (vec.x) +#define to_texture_pos_H_packed(idx, sizes) \ + ivec3(idx.x, idx.y / 4, (idx.z + idx.w * sizes.z)) // Given a buffer(1-D) index cur, compute a new index where the corresponding // tensor(N-D)'s adjacent dimensions are swapped. The parameters x,y and plane // describe sizes. As an example, let's say we want to swap dimensions 0,1 for a // tensor of shape {4,3,2,24} to obtain {3,4,2,24}. Then, x=4, y=3 and // plane=2*24=48. -#define SWAP_ADJ_DIMS(cur, x, y, plane) \ +#define swap_adj_dims(cur, x, y, plane) \ cur + \ plane * \ ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \ diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl index fe1087f637a..08041490dc9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl @@ -39,15 +39,15 @@ void main() { ivec3 mat1_pos = ivec3(0, pos.y, pos.z); - $if MAT2_PACKING == "HEIGHT_PACKED": + $if MAT2_PACKING == "H_packed": ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z); $else: ivec3 mat2_pos = ivec3(pos.x, 0, pos.z); - $if MAT1_PACKING == "WIDTH_PACKED": - int K = DIVUP4(in_sizes.data[0]); + $if MAT1_PACKING == "W_packed": + int K = divup4(in_sizes.data[0]); for (int i = 0; i < K; ++i) { - $if MAT2_PACKING == "HEIGHT_PACKED": + $if MAT2_PACKING == "H_packed": vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); vec4 sums = vec4( dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), @@ -59,7 +59,7 @@ void main() { mat1_pos.x++; mat2_pos.y++; - $elif MAT2_PACKING == "WIDTH_PACKED": + $elif MAT2_PACKING == "W_packed": vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); texel = fma(mat1_tex.xxxx, texelFetch(im_mat2, mat2_pos, 0), texel); mat2_pos.y++; @@ -74,7 +74,7 @@ void main() { $else: 
$raise Exception("Unsupported value for MAT2_PACKING") } - $elif MAT1_PACKING == "CHANNELS_PACKED" and MAT2_PACKING == "CHANNELS_PACKED": + $elif MAT1_PACKING == "C_packed" and MAT2_PACKING == "C_packed": int K = in_sizes.data[0]; for (int i = 0; i < K; ++i) { texel = fma( diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul.yaml index e1699eb1ee8..ef54dbc722a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul.yaml @@ -8,18 +8,16 @@ matmul: parameter_names_with_default_values: DTYPE: float NDIM: 3 - MAT1_PACKING: WIDTH_PACKED - MAT2_PACKING: HEIGHT_PACKED + MAT1_PACKING: W_packed + MAT2_PACKING: H_packed generate_variant_forall: DTYPE: - VALUE: float - SUFFIX: float - VALUE: half - SUFFIX: half shader_variants: - NAME: matmul_W_packed_H_packed - NAME: matmul_W_packed_W_packed - MAT2_PACKING: WIDTH_PACKED + MAT2_PACKING: W_packed - NAME: matmul_C_packed_C_packed - MAT1_PACKING: CHANNELS_PACKED - MAT2_PACKING: CHANNELS_PACKED + MAT1_PACKING: C_packed + MAT2_PACKING: C_packed diff --git a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml index 8228ea862e7..3be032bf85d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml @@ -11,8 +11,6 @@ max_pool2d: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: max_pool2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index 143d3786c05..884f8e06612 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -10,13 +10,21 @@ #define PRECISION ${PRECISION} +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_type(DTYPE)} +#define 
SCALAR_T ${texel_component_type(DTYPE)} + +#define to_tensor_idx to_tensor_idx_${PACKING} +#define get_packed_dim get_packed_dim_${PACKING} +#define get_packed_stride get_packed_stride_${PACKING} + #include "indexing_utils.h" layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - ${T[DTYPE]} data[]; + BUF_T data[]; } buffer_in; @@ -34,31 +42,31 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + const ivec4 idx = to_tensor_idx(pos, gpu_sizes.data); - if (any(greaterThanEqual(coord, gpu_sizes.data))) { + if (any(greaterThanEqual(idx, gpu_sizes.data))) { return; } - const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); + const int base_index = to_buffer_i(idx, cpu_sizes.data); const ivec4 buf_indices = - base_index + ivec4(0, 1, 2, 3) * STRIDE_${PACKING}(cpu_sizes.data); + base_index + ivec4(0, 1, 2, 3) * get_packed_stride(cpu_sizes.data); - ${T[DTYPE]} val_x = buffer_in.data[buf_indices.x]; - ${T[DTYPE]} val_y = buffer_in.data[buf_indices.y]; - ${T[DTYPE]} val_z = buffer_in.data[buf_indices.z]; - ${T[DTYPE]} val_w = buffer_in.data[buf_indices.w]; + SCALAR_T val_x = SCALAR_T(buffer_in.data[buf_indices.x]); + SCALAR_T val_y = SCALAR_T(buffer_in.data[buf_indices.y]); + SCALAR_T val_z = SCALAR_T(buffer_in.data[buf_indices.z]); + SCALAR_T val_w = SCALAR_T(buffer_in.data[buf_indices.w]); - ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w); + VEC4_T texel = VEC4_T(val_x, val_y, val_z, val_w); - const int packed_dim_size = PACKED_DIM_${PACKING}(cpu_sizes.data); - int packed_coord = PACKED_DIM_${PACKING}(coord); + const int packed_dim_size = get_packed_dim(cpu_sizes.data); + int packed_idx = get_packed_dim(idx); - if 
(packed_coord + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_coord) + ivec4(0, 1, 2, 3); - ${VEC4_T[DTYPE]} valid_idx = ${VEC4_T[DTYPE]}(lessThan(packed_ind, ivec4(packed_dim_size))); + if (packed_idx + 3 >= packed_dim_size) { + ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); + VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); texel = texel * valid_idx; } - imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel); + imageStore(image_out, ${get_pos[NDIM]("pos")}, texel); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml index ad74d663d6d..64cee382d1f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -8,22 +8,16 @@ nchw_to_image: parameter_names_with_default_values: NDIM: 3 DTYPE: float - PACKING: CHANNELS_PACKED + PACKING: C_packed generate_variant_forall: PACKING: - - VALUE: CHANNELS_PACKED - SUFFIX: C_packed - - VALUE: WIDTH_PACKED - SUFFIX: W_packed - - VALUE: HEIGHT_PACKED - SUFFIX: H_packed + - VALUE: C_packed + - VALUE: W_packed + - VALUE: H_packed DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float - VALUE: int - SUFFIX: int shader_variants: - NAME: nchw_to_image3d - NAME: nchw_to_image2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl index 5dade115fd6..7466b530a8c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl @@ -8,12 +8,10 @@ #version 450 core -#include "broadcasting_utils.h" -#include "indexing_utils.h" - #define PRECISION ${PRECISION} -#define OP(X, Y, A) ${OPERATOR} +#include "broadcasting_utils.h" +#include "indexing_utils.h" layout(std430) buffer; diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml index 
11971a028fd..f4b77f7b77f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml @@ -6,7 +6,6 @@ no_op: parameter_names_with_default_values: - OPERATOR: X + A * Y NDIM: 3 DTYPE: float generate_variant_forall: @@ -17,10 +16,7 @@ no_op: SUFFIX: 2d DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float - VALUE: int - SUFFIX: int shader_variants: - NAME: no_op diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl index ed10ec2711c..5fff6be177c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl @@ -8,11 +8,11 @@ #version 450 core +#define PRECISION ${PRECISION} + #include "broadcasting_utils.h" #include "indexing_utils.h" -#define PRECISION ${PRECISION} - layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml index 15b8239b84d..de3fddce888 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml @@ -11,8 +11,6 @@ sum_dim: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: sum_dim diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl index cd54981f099..3855c4440de 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl @@ -8,10 +8,10 @@ #version 450 core -#include "indexing_utils.h" - #define PRECISION ${PRECISION} +#include "indexing_utils.h" + layout(std430) buffer; layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly 
${IMAGE_T[NDIM][DTYPE]} image_out; diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.yaml b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.yaml index 37635925748..f74bf229e5b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.yaml @@ -11,8 +11,6 @@ sum_dim_keepdim: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: sum_dim_keepdim diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl index 5c8d4f845cc..3b3db3cc32c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl @@ -10,7 +10,9 @@ #define PRECISION ${PRECISION} -#define OP(X, A, B) ${OPERATOR} +#define VEC4_T ${texel_type(DTYPE)} + +#define op(X, A, B) ${OPERATOR} layout(std430) buffer; @@ -41,6 +43,6 @@ void main() { return; } - vec4 in_texel = texelFetch(image_in, pos, 0); - imageStore(image_out, pos, OP(in_texel, minimum.data, maximum.data)); + VEC4_T in_texel = texelFetch(image_in, pos, 0); + imageStore(image_out, pos, op(in_texel, minimum.data, maximum.data)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml index a4cfa38432d..c32593d700c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml @@ -6,9 +6,7 @@ unary_op: generate_variant_forall: DTYPE: - VALUE: half - SUFFIX: half - VALUE: float - SUFFIX: float shader_variants: - NAME: abs OPERATOR: abs(X) diff --git a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl b/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl index f5e5d6b4e4d..7f72ac58972 100644 --- a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl +++ b/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl @@ -7,11 
+7,10 @@ */ #version 450 core -// clang-format off + #define PRECISION ${PRECISION} -#define OP(X, Y) ${OPERATOR} -// clang-format on +#define op(X, Y) ${OPERATOR} layout(std430) buffer; @@ -38,5 +37,5 @@ void main() { vec4 in_texel = texelFetch(image_in, pos, 0); vec4 other_texel = texelFetch(image_other, pos, 0); - imageStore(image_out, pos, OP(in_texel, other_texel)); + imageStore(image_out, pos, op(in_texel, other_texel)); } diff --git a/backends/vulkan/test/glsl/fill_texture__test.glsl b/backends/vulkan/test/glsl/fill_texture__test.glsl index fafad11d498..76c630de55e 100644 --- a/backends/vulkan/test/glsl/fill_texture__test.glsl +++ b/backends/vulkan/test/glsl/fill_texture__test.glsl @@ -7,15 +7,12 @@ */ #version 450 core + #define PRECISION ${PRECISION} layout(std430) buffer; -/* Qualifiers: layout - storage - precision - memory */ - -// clang-format off layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} uOutput; -// clang-format on layout(set = 0, binding = 1) uniform PRECISION restrict Block { ivec3 size; int fill; diff --git a/backends/vulkan/test/glsl/idx_fill_texture.glsl b/backends/vulkan/test/glsl/idx_fill_texture.glsl index a6500bd3ede..b821f8436fc 100644 --- a/backends/vulkan/test/glsl/idx_fill_texture.glsl +++ b/backends/vulkan/test/glsl/idx_fill_texture.glsl @@ -10,6 +10,8 @@ #define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + #include "indexing_utils.h" layout(std430) buffer; @@ -40,7 +42,7 @@ void main() { const ivec4 buf_indices = base_index + ivec4(0, 1, 2, 3) * PLANE_SIZE_${PACKING}(gpu_sizes.data); - ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(buf_indices); + VEC4_T texel = VEC4_T(buf_indices); - imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel); + imageStore(image_out, ${get_pos[NDIM]("pos")}, texel); } diff --git a/backends/vulkan/test/glsl/test_shader.glsl b/backends/vulkan/test/glsl/test_shader.glsl index 39edc92cc62..4804528346d 100644 --- 
a/backends/vulkan/test/glsl/test_shader.glsl +++ b/backends/vulkan/test/glsl/test_shader.glsl @@ -7,16 +7,14 @@ */ #version 450 core + #define PRECISION ${PRECISION} -#define FORMAT ${FORMAT} layout(std430) buffer; -/* Qualifiers: layout - storage - precision - memory */ - -layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION restrict Block { +layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { ivec4 size; } uBlock;