From 2a3f79616792f35c822dae0ddcae93c051aa32e4 Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Thu, 4 Apr 2024 15:12:27 -0700
Subject: [PATCH] [ET-VK][Ez] Fix Validation Layer warnings about wrong image
 layout

## Context

Currently, when executing a `ComputeGraph` with prepacked tensors with [Vulkan Validation Layers](https://github.com/KhronosGroup/Vulkan-ValidationLayers) turned on, the following Validation Errors can be observed. Note that Validation Layers can be turned on by running Vulkan binaries on Mac with the `vkconfig` app opened.

```
UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout(ERROR / SPEC): msgNum: 1303270965 - Validation Error: [ UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout ] Object 0: handle = 0x7fb76dbbf988, type = VK_OBJECT_TYPE_COMMAND_BUFFER; | MessageID = 0x4dae5635 | vkQueueSubmit(): pSubmits[0].pCommandBuffers[0] command buffer VkCommandBuffer 0x7fb76dbbf988[] expects VkImage 0xd79c8a0000000f09[] (subresource: aspectMask 0x1 array layer 0, mip level 0) to be in layout VK_IMAGE_LAYOUT_GENERAL--instead, current layout is VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL.
    Objects: 1
        [0] 0x7fb76dbbf988, type: 6, name: NULL
```

The reason for this is that prepacked textures are written to with `WRITE` memory access during packing, which means they will be in the `VK_IMAGE_LAYOUT_GENERAL` layout. However, they will subsequently be read from during `graph.execute()`, meaning the texture will have transitioned to `VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL`, but will be bound using the `VK_IMAGE_LAYOUT_GENERAL` layout. Subsequent calls to `execute()` will therefore see that the prepacked texture has been bound with the wrong layout, since after the first graph execution the texture will have the `VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL` layout.

The solution is to submit a no-op shader dispatch during prepacking to trigger a transition to the `READ_ONLY_OPTIMAL` layout.

Differential Revision: [D55772003](https://our.internmc.facebook.com/intern/diff/D55772003/)

[ghstack-poisoned]
---
 .../vulkan/runtime/graph/ops/PrepackNode.cpp  | 61 ++++++++++++++-----
 .../vulkan/runtime/graph/ops/PrepackNode.h    |  1 +
 .../vulkan/runtime/graph/ops/glsl/no_op.glsl  | 27 ++++++++
 .../vulkan/runtime/graph/ops/glsl/no_op.yaml  | 26 ++++++++
 .../graph/ops/utils/ShaderNameUtils.cpp       | 13 ++++
 .../runtime/graph/ops/utils/ShaderNameUtils.h |  2 +
 6 files changed, 114 insertions(+), 16 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/no_op.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/no_op.yaml

diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
index 60d1982d97e..db6bdb75376 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -11,10 +11,19 @@
 #include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
 
 namespace vkcompute {
 
+api::ShaderInfo get_noop_shader(ComputeGraph& graph, const ValueRef packed) {
+  std::stringstream noop_shader_name;
+  noop_shader_name << "no_op";
+  apply_ndim_suffix(noop_shader_name, graph.get_val(packed).toTensor());
+  apply_dtype_suffix(noop_shader_name, graph.get_val(packed).toTensor());
+  return VK_KERNEL_FROM_STR(noop_shader_name.str());
+}
+
 PrepackNode::PrepackNode(
     ComputeGraph& graph,
     const api::ShaderInfo& shader,
@@ -24,17 +33,18 @@ PrepackNode::PrepackNode(
     const ValueRef packed,
     const std::vector<std::shared_ptr<api::UniformParamsBuffer>>& params)
     : shader_(shader),
+      noop_shader_(get_noop_shader(graph, packed)),
       global_workgroup_size_(global_workgroup_size),
       local_workgroup_size_(local_workgroup_size),
       tref_(tref),
       packed_(packed),
       params_(params) {
   graph.update_descriptor_counts(shader, /*execute = */ false);
+  graph.update_descriptor_counts(noop_shader_, /*execute = */ false);
 }
 
 void PrepackNode::encode(ComputeGraph* graph) {
   api::Context* const context = graph->context();
-  api::PipelineBarrier pipeline_barrier{};
 
   TensorRef tref = graph->get_val(tref_).toTensorRef();
   vTensor packed = graph->get_val(packed_).toTensor();
@@ -46,21 +56,40 @@ void PrepackNode::encode(ComputeGraph* graph) {
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
-  api::DescriptorSet descriptor_set =
-      context->get_descriptor_set(shader_, local_workgroup_size_);
-
-  uint32_t idx = 0;
-  bind_tensor_to_descriptor_set(
-      packed,
-      pipeline_barrier,
-      api::MemoryAccessType::WRITE,
-      descriptor_set,
-      idx++);
-  bind_staging_to_descriptor_set(staging, descriptor_set, idx++);
-  bind_params_to_descriptor_set(params_, descriptor_set, idx);
-
-  context->register_shader_dispatch(
-      descriptor_set, pipeline_barrier, shader_, global_workgroup_size_);
+  {
+    api::PipelineBarrier pipeline_barrier{};
+    api::DescriptorSet descriptor_set =
+        context->get_descriptor_set(shader_, local_workgroup_size_);
+
+    uint32_t idx = 0;
+    bind_tensor_to_descriptor_set(
+        packed,
+        pipeline_barrier,
+        api::MemoryAccessType::WRITE,
+        descriptor_set,
+        idx++);
+    bind_staging_to_descriptor_set(staging, descriptor_set, idx++);
+    bind_params_to_descriptor_set(params_, descriptor_set, idx);
+
+    context->register_shader_dispatch(
+        descriptor_set, pipeline_barrier, shader_, global_workgroup_size_);
+  }
+
+  {
+    api::PipelineBarrier pipeline_barrier{};
+    api::DescriptorSet descriptor_set =
+        context->get_descriptor_set(noop_shader_, {1, 1, 1});
+
+    bind_tensor_to_descriptor_set(
+        packed,
+        pipeline_barrier,
+        api::MemoryAccessType::READ,
+        descriptor_set,
+        0);
+
+    context->register_shader_dispatch(
+        descriptor_set, pipeline_barrier, noop_shader_, {1, 1, 1});
+  }
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h
index dd31be12b37..f894e179407 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.h
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h
@@ -41,6 +41,7 @@ class PrepackNode final {
 
  protected:
   const api::ShaderInfo shader_;
+  api::ShaderInfo noop_shader_;
   const api::utils::uvec3 global_workgroup_size_;
   const api::utils::uvec3 local_workgroup_size_;
   const ValueRef tref_;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl
new file mode 100644
index 00000000000..f19f8deb7d6
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#include "broadcasting_utils.h"
+#include "indexing_utils.h"
+
+#define PRECISION ${PRECISION}
+
+#define OP(X, Y, A) ${OPERATOR}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  ${VEC4_T[DTYPE]} tex = texelFetch(image_in, ${GET_POS[NDIM]("pos")}, 0);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml
new file mode 100644
index 00000000000..11971a028fd
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+no_op:
+  parameter_names_with_default_values:
+    OPERATOR: X + A * Y
+    NDIM: 3
+    DTYPE: float
+  generate_variant_forall:
+    NDIM:
+      - VALUE: 3
+        SUFFIX: 3d
+      - VALUE: 2
+        SUFFIX: 2d
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+      - VALUE: int
+        SUFFIX: int
+  shader_variants:
+    - NAME: no_op
diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
index 9bdb3c4be58..4f29e93e161 100644
--- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
@@ -26,6 +26,19 @@ void apply_dtype_suffix(std::stringstream& kernel_name, const vTensor& tensor) {
   }
 }
 
+void apply_ndim_suffix(std::stringstream& kernel_name, const vTensor& tensor) {
+  switch (tensor.storage_type()) {
+    case api::StorageType::TEXTURE_3D:
+      kernel_name << "_3d";
+      break;
+    case api::StorageType::TEXTURE_2D:
+      kernel_name << "_2d";
+      break;
+    default:
+      break;
+  }
+}
+
 void apply_memory_layout_suffix(
     std::stringstream& kernel_name,
     const vTensor& tensor) {
diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h
index 3f094432bb1..3da972b224c 100644
--- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h
+++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h
@@ -16,6 +16,8 @@ namespace vkcompute {
 
 void apply_dtype_suffix(std::stringstream& kernel_name, const vTensor& tensor);
 
+void apply_ndim_suffix(std::stringstream& kernel_name, const vTensor& tensor);
+
 void apply_memory_layout_suffix(
     std::stringstream& kernel_name,
     const vTensor& tensor);